import json
import os

import numpy as np
import pandas as pd
import sklearn.metrics
from matplotlib import pyplot as plt

from pipeline import (
    load_dataset,
    filter_data,
    filter_test_data,
    prepare_user_data,
    train_models,
    evaluate_models,
    prepare_data_for_model,
    model_type_gru,
    model_type_lstm,
    model_type_bilstm,
    train_models_v2,
)

# === Column names and result keys ===
year_str = 'Year'
month_str = 'Month'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'

# One-hot day-of-week columns that are dropped for the 15-minute datasets.
weak_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                      'Friday', 'Saturday', 'Sunday']]

figure_path = 'figures/'

# === Configurable Parameters ===
dataset_path = './Datasets/'
DATA_PATH = dataset_path + 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]  # You can add more, e.g. [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
TEST_SCENARIO = [(2020, [1, 2])]  # Jan–Feb 2020 only

# === Optional, for display only ===
predefined_training_scenarios = {
    "Scenario 1": {"years_months": [(2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]},
    "Scenario 2": {"years_months": [(2017, list(range(1, 13))),
                                    (2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]},
}
predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]},
}


def remove_covid_data(df):
    # Drop everything after February 2020 to exclude COVID-era behaviour.
    return df[~((df[year_str] == 2020) & (df[month_str] > 2))]


def split_data_by_month_percentage(df, percentages):
    # Chronological split: the first train_p% of distinct (year, month) pairs
    # go to training, the next valid_p% to validation, the rest to test.
    train_p, valid_p, test_p = percentages
    ids = df[[year_str, month_str]].drop_duplicates().sort_values([year_str, month_str])
    tr, va, te = np.split(ids, [int((train_p / 100) * len(ids)),
                                int(((train_p + valid_p) / 100) * len(ids))])
    return (df.merge(tr, on=[year_str, month_str], how='inner'),
            df.merge(va, on=[year_str, month_str], how='inner'),
            df.merge(te, on=[year_str, month_str], how='inner'))


def split_data_by_userdata_percentage(df, percentages, sample=100):
    # Per-user split so every user contributes to all three sets. `sample`
    # optionally subsamples each user's data first (in percent); it defaults
    # to 100 because most callers do not pass it.
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
        user_data = (df[df[user_str] == user_id]
                     .sample(frac=sample / 100)
                     .sort_values([year_str, month_str]))
        u_tr, u_va, u_te = np.split(user_data,
                                    [int((train_p / 100) * len(user_data)),
                                     int(((train_p + valid_p) / 100) * len(user_data))])
        tr = pd.concat([tr, u_tr], ignore_index=True)
        va = pd.concat([va, u_va], ignore_index=True)
        te = pd.concat([te, u_te], ignore_index=True)
    return tr, va, te
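# A minimal sketch (illustrative only, not part of the pipeline) of how the
# per-user 80/10/10 split behaves on a hypothetical toy frame: each user's
# rows are shuffled by sample(), re-sorted chronologically, and cut at the
# 80% and 90% marks.
def _split_demo():
    toy = pd.DataFrame({user_str: [1] * 10,
                        year_str: [2018] * 10,
                        month_str: list(range(1, 11))})
    tr, va, te = split_data_by_userdata_percentage(toy, percentages=(80, 10, 10))
    print(len(tr), len(va), len(te))  # expected: 8 1 1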
def main():
    # print("=== Training Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("training", predefined_training_scenarios, predefined_validation_scenarios)
    # print("\n=== Validation Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("validation", predefined_training_scenarios, predefined_validation_scenarios)

    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)
    ALLUSERS32_15MIN_WITHOUTTHREHOLD = 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH

    training_data = filter_data(df, TRAINING_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    validation_data = filter_data(df, VALIDATION_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    user_data_train = prepare_user_data(training_data)
    user_data_val = prepare_user_data(validation_data)

    # === Train models ===
    best_models = train_models(user_data_train, user_data_val, sequence_lengths=SEQUENCE_LENGTHS)

    # === Load and evaluate test data ===
    test_df = filter_test_data(df, TEST_SCENARIO)
    evaluate_models(best_models, test_df, SEQUENCE_LENGTHS, OUTPUT_EXCEL_PATH,
                    ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")


def reduce_columns(df, filename):
    # The 15-minute datasets additionally drop the weak day-of-week columns.
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + weak_column_names,
                       errors='ignore')
    return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')


def load_previous_results(filename):
    # Reload previously saved results so already-finished runs can be skipped.
    results = pd.DataFrame()
    if os.path.exists(filename):
        results = pd.DataFrame(json.load(open(filename)))
    return results


def main_two_v2(model_type):
    seq_lengths = range(10, 31, 5)
    for sequence_length in seq_lengths:
        for data_filename in os.listdir(dataset_path):
            # Derive the timespan/threshold identifiers from the filename.
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str

            # Skip parameter combinations that were already evaluated.
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                if len(results[(results[timespan_str] == timespan_id)
                               & (results[threshold_str] == threshold_id)
                               & (results[sequence_length_str] == sequence_length)
                               & (results[model_type_str] == model_type)]) > 0:
                    continue

            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)
            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)
            best_model = train_models_v2(user_data_train, user_data_val,
                                         sequence_length=sequence_length,
                                         model_type=model_type)
            # Reload before appending so results written in the meantime survive.
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results,
                                 evaluate_model_on_test_data(model=best_model,
                                                             test_df=te,
                                                             sequence_length=sequence_length,
                                                             time_span_id=timespan_id,
                                                             threshold_id=threshold_id,
                                                             model_type=model_type,
                                                             split_id=data_split_str)],
                                ignore_index=True)
            results.to_json(result_filename_v2)
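# Illustrative refactoring sketch (hypothetical helper, not used above): the
# inline duplicate checks in main_two_v1/main_two_v2 amount to asking whether
# a result row already exists for a given parameter combination.
def _already_evaluated(results, criteria):
    # `criteria` maps result-column names (which contain spaces) to values,
    # e.g. {timespan_str: '15MIN', sequence_length_str: 20}.
    if len(results) == 0:
        return False
    mask = pd.Series(True, index=results.index)
    for column, value in criteria.items():
        mask &= results[column] == value
    return bool(mask.any())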
def main_two_v1():
    seq_lengths = [30, 25, 20, 15, 10, 5]  # You can add more, e.g. [20, 25, 30]
    results = pd.DataFrame()
    if os.path.exists(result_filename_v1):
        results = pd.DataFrame(json.load(open(result_filename_v1)))
    for sequence_length in seq_lengths:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),
                                           (month_split_str, split_data_by_month_percentage)]:
                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
                    # Derive the timespan/threshold identifiers from the filename.
                    timespan_id = hour_timespan_str
                    threshold_id = with_threshold_str
                    if min_timespan_str in data_filename:
                        timespan_id = min_timespan_str
                    if without_threshold_str in data_filename:
                        threshold_id = without_threshold_str

                    # Skip parameter combinations that were already evaluated.
                    if len(results) > 0:
                        if len(results[(results[split_str] == split_id)
                                       & (results[timespan_str] == timespan_id)
                                       & (results[threshold_str] == threshold_id)
                                       & (results[sequence_length_str] == sequence_length)
                                       & (results[model_type_str] == model_type)]) > 0:
                            continue

                    file_path = os.path.join(dataset_path, data_filename)
                    df = load_dataset(file_path)
                    df = remove_covid_data(df)
                    tr, val, te = split_method(df, percentages=(80, 10, 10))
                    tr = reduce_columns(tr, data_filename)
                    val = reduce_columns(val, data_filename)
                    te = reduce_columns(te, data_filename)
                    user_data_train = prepare_user_data(tr)
                    user_data_val = prepare_user_data(val)
                    best_models = train_models(user_data_train, user_data_val,
                                               sequence_lengths=[sequence_length],
                                               model_type=model_type)
                    results = pd.concat([results,
                                         evaluate_model_on_test_data(
                                             model=best_models[sequence_length]['model'],
                                             test_df=te,
                                             split_id=split_id,
                                             sequence_length=sequence_length,
                                             time_span_id=timespan_id,
                                             threshold_id=threshold_id,
                                             model_type=model_type)],
                                        ignore_index=True)
                    results.to_json(result_filename_v1)


# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id,
                                threshold_id, time_span_id, model_type):
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    recall = sklearn.metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    # One result row per evaluated configuration.
    return pd.DataFrame({split_str: [split_id],
                         threshold_str: [threshold_id],
                         timespan_str: [time_span_id],
                         sequence_length_str: [sequence_length],
                         model_type_str: [model_type],
                         recall_str: [recall],
                         precision_str: [precision],
                         f1_string: [f1_score]})


def visualise_results_v1():
    results = pd.DataFrame(json.load(open(result_filename_v1)))
    # The month split is consistently worse, so only the data split is plotted.
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v1_results.svg')
    # Conclusion: no variant is clearly better than the others.


def visualise_results_v2():
    results = pd.DataFrame(json.load(open(result_filename_v2)))
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            # Sort by sequence length so the lines are drawn left to right.
            with_sub = with_sub.sort_values(sequence_length_str)
            without_sub = without_sub.sort_values(sequence_length_str)
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v2_results.svg')
    # Conclusion: no variant is clearly better than the others.
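# For reference, a minimal sketch (toy labels, purely illustrative) of what
# average='weighted' means in the metrics above: per-class scores are
# averaged with weights proportional to each class's support.
def _weighted_f1_demo():
    y_true = [0, 0, 0, 1, 1, 2]
    y_pred = [0, 0, 1, 1, 1, 2]
    # Supports are 3, 2 and 1, so the per-class F1 scores are combined with
    # weights 3/6, 2/6 and 1/6.
    return sklearn.metrics.f1_score(y_true, y_pred, average='weighted')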
def test(model_type):
    # Quick experiment: how does the amount of data kept per user
    # (33%, 66%, 100%) affect test performance on the first dataset found?
    sequence_length = 20
    data_filename = os.listdir(dataset_path)[0]
    timespan_id = hour_timespan_str
    threshold_id = with_threshold_str
    file_path = os.path.join(dataset_path, data_filename)
    df = load_dataset(file_path)
    df = remove_covid_data(df)
    results = pd.DataFrame()
    for percentage in [33, 66, 100]:
        print('Percentage:', percentage)
        tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10),
                                                        sample=percentage)
        tr = reduce_columns(tr, data_filename)
        val = reduce_columns(val, data_filename)
        te = reduce_columns(te, data_filename)
        user_data_train = prepare_user_data(tr)
        user_data_val = prepare_user_data(val)
        best_model = train_models_v2(user_data_train, user_data_val,
                                     sequence_length=sequence_length,
                                     model_type=model_type)
        results = pd.concat([results,
                             evaluate_model_on_test_data(model=best_model,
                                                         test_df=te,
                                                         sequence_length=sequence_length,
                                                         time_span_id=timespan_id,
                                                         threshold_id=threshold_id,
                                                         model_type=model_type,
                                                         split_id=data_split_str)],
                            ignore_index=True)
    print(results)


if __name__ == "__main__":
    # main_two_v1()
    # visualise_results_v1()
    test(model_type=model_type_gru)
    # main_two_v2(model_type=model_type_gru)
    # visualise_results_v2()
    print('Done')
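# Typical entry points, mirroring the commented-out calls in the __main__
# block above (uncomment there as needed):
#   main_two_v1(); visualise_results_v1()        -> full v1 grid search + plots
#   main_two_v2(model_type=model_type_gru); visualise_results_v2()
#   test(model_type=model_type_gru)              -> quick sample-size sweep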