import json
import os

import numpy as np
import pandas as pd
import sklearn.metrics
from matplotlib import pyplot as plt

from pipeline import (
    load_dataset,
    filter_data,
    filter_test_data,
    prepare_user_data,
    train_models,
    evaluate_models,
    prepare_data_for_model,
    model_type_gru,
    model_type_lstm,
    model_type_bilstm,
    train_models_v2,
)

# === Column names and result keys ===
year_str = 'Year'
month_str = 'Month'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'

# One-hot day-of-week columns that are dropped for the 15-minute datasets.
weak_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                      'Friday', 'Saturday', 'Sunday']]

figure_path = 'figures/'

# === Configurable Parameters ===
dataset_path = './Datasets/'
DATA_PATH = dataset_path + 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]  # You can add more, e.g. [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
TEST_SCENARIO = [(2020, [1, 2])]  # Jan–Feb 2020 only

# === Optional, for display only ===
predefined_training_scenarios = {
    "Scenario 1": {"years_months": [(2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]},
    "Scenario 2": {"years_months": [(2017, list(range(1, 13))),
                                    (2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]},
}
predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]},
}


def remove_covid_data(df):
    # Drop everything after February 2020 to exclude COVID-era behaviour.
    return df[~((df[year_str] == 2020) & (df[month_str] > 2))]


def split_data_by_month_percentage(df, percentages):
    # Chronological split: the first train_p% of distinct (year, month) pairs
    # go to training, the next valid_p% to validation, the rest to test.
    train_p, valid_p, test_p = percentages
    ids = df[[year_str, month_str]].drop_duplicates().sort_values([year_str, month_str])
    tr, va, te = np.split(ids, [int((train_p / 100) * len(ids)),
                                int(((train_p + valid_p) / 100) * len(ids))])
    return (df.merge(tr, on=[year_str, month_str], how='inner'),
            df.merge(va, on=[year_str, month_str], how='inner'),
            df.merge(te, on=[year_str, month_str], how='inner'))


def split_data_by_userdata_percentage(df, percentages, sample=100):
    # Per-user split so every user contributes to all three sets. `sample`
    # optionally subsamples each user's data first (in percent); it defaults
    # to 100 because most callers do not pass it.
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
        user_data = (df[df[user_str] == user_id]
                     .sample(frac=sample / 100)
                     .sort_values([year_str, month_str]))
        u_tr, u_va, u_te = np.split(user_data,
                                    [int((train_p / 100) * len(user_data)),
                                     int(((train_p + valid_p) / 100) * len(user_data))])
        tr = pd.concat([tr, u_tr], ignore_index=True)
        va = pd.concat([va, u_va], ignore_index=True)
        te = pd.concat([te, u_te], ignore_index=True)
    return tr, va, te
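# A minimal sketch (illustrative only, not part of the pipeline) of how the
# per-user 80/10/10 split behaves on a hypothetical toy frame: each user's
# rows are shuffled by sample(), re-sorted chronologically, and cut at the
# 80% and 90% marks.
def _split_demo():
    toy = pd.DataFrame({user_str: [1] * 10,
                        year_str: [2018] * 10,
                        month_str: list(range(1, 11))})
    tr, va, te = split_data_by_userdata_percentage(toy, percentages=(80, 10, 10))
    print(len(tr), len(va), len(te))  # expected: 8 1 1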
def main():
    # print("=== Training Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("training", predefined_training_scenarios, predefined_validation_scenarios)
    # print("\n=== Validation Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("validation", predefined_training_scenarios, predefined_validation_scenarios)

    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)
    ALLUSERS32_15MIN_WITHOUTTHREHOLD = 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH

    training_data = filter_data(df, TRAINING_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    validation_data = filter_data(df, VALIDATION_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    user_data_train = prepare_user_data(training_data)
    user_data_val = prepare_user_data(validation_data)

    # === Train models ===
    best_models = train_models(user_data_train, user_data_val, sequence_lengths=SEQUENCE_LENGTHS)

    # === Load and evaluate test data ===
    test_df = filter_test_data(df, TEST_SCENARIO)
    evaluate_models(best_models, test_df, SEQUENCE_LENGTHS, OUTPUT_EXCEL_PATH,
                    ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")


def reduce_columns(df, filename):
    # The 15-minute datasets additionally drop the weak day-of-week columns.
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + weak_column_names,
                       errors='ignore')
    return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')


def load_previous_results(filename):
    # Reload previously saved results so already-finished runs can be skipped.
    results = pd.DataFrame()
    if os.path.exists(filename):
        results = pd.DataFrame(json.load(open(filename)))
    return results


def main_two_v2(model_type):
    seq_lengths = range(10, 31, 5)
    for sequence_length in seq_lengths:
        for data_filename in os.listdir(dataset_path):
            # Derive the timespan/threshold identifiers from the filename.
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str

            # Skip parameter combinations that were already evaluated.
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                if len(results[(results[timespan_str] == timespan_id)
                               & (results[threshold_str] == threshold_id)
                               & (results[sequence_length_str] == sequence_length)
                               & (results[model_type_str] == model_type)]) > 0:
                    continue

            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)
            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)
            best_model = train_models_v2(user_data_train, user_data_val,
                                         sequence_length=sequence_length,
                                         model_type=model_type)
            # Reload before appending so results written in the meantime survive.
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results,
                                 evaluate_model_on_test_data(model=best_model,
                                                             test_df=te,
                                                             sequence_length=sequence_length,
                                                             time_span_id=timespan_id,
                                                             threshold_id=threshold_id,
                                                             model_type=model_type,
                                                             split_id=data_split_str)],
                                ignore_index=True)
            results.to_json(result_filename_v2)
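# Illustrative refactoring sketch (hypothetical helper, not used above): the
# inline duplicate checks in main_two_v1/main_two_v2 amount to asking whether
# a result row already exists for a given parameter combination.
def _already_evaluated(results, criteria):
    # `criteria` maps result-column names (which contain spaces) to values,
    # e.g. {timespan_str: '15MIN', sequence_length_str: 20}.
    if len(results) == 0:
        return False
    mask = pd.Series(True, index=results.index)
    for column, value in criteria.items():
        mask &= results[column] == value
    return bool(mask.any())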
def main_two_v1():
    seq_lengths = [30, 25, 20, 15, 10, 5]  # You can add more, e.g. [20, 25, 30]
    results = pd.DataFrame()
    if os.path.exists(result_filename_v1):
        results = pd.DataFrame(json.load(open(result_filename_v1)))
    for sequence_length in seq_lengths:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),
                                           (month_split_str, split_data_by_month_percentage)]:
                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
                    # Derive the timespan/threshold identifiers from the filename.
                    timespan_id = hour_timespan_str
                    threshold_id = with_threshold_str
                    if min_timespan_str in data_filename:
                        timespan_id = min_timespan_str
                    if without_threshold_str in data_filename:
                        threshold_id = without_threshold_str

                    # Skip parameter combinations that were already evaluated.
                    if len(results) > 0:
                        if len(results[(results[split_str] == split_id)
                                       & (results[timespan_str] == timespan_id)
                                       & (results[threshold_str] == threshold_id)
                                       & (results[sequence_length_str] == sequence_length)
                                       & (results[model_type_str] == model_type)]) > 0:
                            continue

                    file_path = os.path.join(dataset_path, data_filename)
                    df = load_dataset(file_path)
                    df = remove_covid_data(df)
                    tr, val, te = split_method(df, percentages=(80, 10, 10))
                    tr = reduce_columns(tr, data_filename)
                    val = reduce_columns(val, data_filename)
                    te = reduce_columns(te, data_filename)
                    user_data_train = prepare_user_data(tr)
                    user_data_val = prepare_user_data(val)
                    best_models = train_models(user_data_train, user_data_val,
                                               sequence_lengths=[sequence_length],
                                               model_type=model_type)
                    results = pd.concat([results,
                                         evaluate_model_on_test_data(
                                             model=best_models[sequence_length]['model'],
                                             test_df=te,
                                             split_id=split_id,
                                             sequence_length=sequence_length,
                                             time_span_id=timespan_id,
                                             threshold_id=threshold_id,
                                             model_type=model_type)],
                                        ignore_index=True)
                    results.to_json(result_filename_v1)


# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id,
                                threshold_id, time_span_id, model_type):
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    recall = sklearn.metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    # One result row per evaluated configuration.
    return pd.DataFrame({split_str: [split_id],
                         threshold_str: [threshold_id],
                         timespan_str: [time_span_id],
                         sequence_length_str: [sequence_length],
                         model_type_str: [model_type],
                         recall_str: [recall],
                         precision_str: [precision],
                         f1_string: [f1_score]})


def visualise_results_v1():
    results = pd.DataFrame(json.load(open(result_filename_v1)))
    # The month split is consistently worse, so only the data split is plotted.
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v1_results.svg')
    # Conclusion: no variant is clearly better than the others.


def visualise_results_v2():
    results = pd.DataFrame(json.load(open(result_filename_v2)))
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            # Sort by sequence length so the lines are drawn left to right.
            with_sub = with_sub.sort_values(sequence_length_str)
            without_sub = without_sub.sort_values(sequence_length_str)
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v2_results.svg')
    # Conclusion: no variant is clearly better than the others.
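# For reference, a minimal sketch (toy labels, purely illustrative) of what
# average='weighted' means in the metrics above: per-class scores are
# averaged with weights proportional to each class's support.
def _weighted_f1_demo():
    y_true = [0, 0, 0, 1, 1, 2]
    y_pred = [0, 0, 1, 1, 1, 2]
    # Supports are 3, 2 and 1, so the per-class F1 scores are combined with
    # weights 3/6, 2/6 and 1/6.
    return sklearn.metrics.f1_score(y_true, y_pred, average='weighted')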
def test(model_type):
    # Quick experiment: how does the amount of data kept per user
    # (33%, 66%, 100%) affect test performance on the first dataset found?
    sequence_length = 20
    data_filename = os.listdir(dataset_path)[0]
    timespan_id = hour_timespan_str
    threshold_id = with_threshold_str
    file_path = os.path.join(dataset_path, data_filename)
    df = load_dataset(file_path)
    df = remove_covid_data(df)
    results = pd.DataFrame()
    for percentage in [33, 66, 100]:
        print('Percentage:', percentage)
        tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10),
                                                        sample=percentage)
        tr = reduce_columns(tr, data_filename)
        val = reduce_columns(val, data_filename)
        te = reduce_columns(te, data_filename)
        user_data_train = prepare_user_data(tr)
        user_data_val = prepare_user_data(val)
        best_model = train_models_v2(user_data_train, user_data_val,
                                     sequence_length=sequence_length,
                                     model_type=model_type)
        results = pd.concat([results,
                             evaluate_model_on_test_data(model=best_model,
                                                         test_df=te,
                                                         sequence_length=sequence_length,
                                                         time_span_id=timespan_id,
                                                         threshold_id=threshold_id,
                                                         model_type=model_type,
                                                         split_id=data_split_str)],
                            ignore_index=True)
    print(results)


if __name__ == "__main__":
    # main_two_v1()
    # visualise_results_v1()
    test(model_type=model_type_gru)
    # main_two_v2(model_type=model_type_gru)
    # visualise_results_v2()
    print('Done')
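# Typical entry points, mirroring the commented-out calls in the __main__
# block above (uncomment there as needed):
#   main_two_v1(); visualise_results_v1()        -> full v1 grid search + plots
#   main_two_v2(model_type=model_type_gru); visualise_results_v2()
#   test(model_type=model_type_gru)              -> quick sample-size sweep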