import json
import os

import numpy as np
import pandas as pd
import sklearn.metrics
from keras.regularizers import L1L2
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler

from pipeline import (
    load_dataset, filter_data, filter_test_data, prepare_user_data,
    train_models, evaluate_models, prepare_data_for_model,
    model_type_gru, model_type_lstm, model_type_bilstm,
    train_models_v2, train_one_model, eval_metrics
)

# Column names and result keys used throughout this module.
year_str = 'Year'
month_str = 'Month'
date_str = 'Date'
time_str = 'Time'
day_of_week_str = 'DayOfWeek'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
accuracy_str = 'accuracy'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'

week_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                      'Friday', 'Saturday', 'Sunday']]

figure_path = 'figures/'

# === Configurable Parameters ===
dataset_path = './Datasets/'
dataset_hrs_path = './Datasets/hours.json'
dataset_min_path = './Datasets/minutes.json'
DATA_PATH = dataset_path + 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]  # adjust as needed
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
TEST_SCENARIO = [(2020, [1, 2])]  # Jan–Feb 2020 only

# === Optional display only ===
predefined_training_scenarios = {
    "Scenario 1": {"years_months": [(2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]},
    "Scenario 2": {"years_months": [(2017, list(range(1, 13))),
                                    (2018, list(range(1, 13))),
                                    (2019, list(range(1, 10)))]}
}
predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
}


def remove_covid_data(df):
    """Drop every row after February 2020 so COVID-era behaviour does not skew the models."""
    df = df[~((df[year_str] == 2020) & (df[month_str] > 2))]
    return df


def split_data_by_month_percentage(df, percentages):
    """Split chronologically by whole (year, month) groups into train/validation/test."""
    train_p, valid_p, test_p = percentages
    ids = df[[year_str, month_str]].drop_duplicates().sort_values([year_str, month_str])
    tr, va, te = np.split(ids, [int((train_p / 100) * len(ids)),
                                int(((train_p + valid_p) / 100) * len(ids))])
    return (df.merge(tr, on=[year_str, month_str], how='inner'),
            df.merge(va, on=[year_str, month_str], how='inner'),
            df.merge(te, on=[year_str, month_str], how='inner'))


def split_data_by_userdata_percentage(df, percentages, sample=100):
    """Split each user's rows chronologically; `sample` subsamples each user's data first."""
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
        user_data = (df[df[user_str] == user_id]
                     .sample(frac=sample / 100)
                     .sort_values([year_str, month_str]))
        u_tr, u_va, u_te = np.split(user_data,
                                    [int((train_p / 100) * len(user_data)),
                                     int(((train_p + valid_p) / 100) * len(user_data))])
        tr = pd.concat([tr, u_tr], ignore_index=True)
        va = pd.concat([va, u_va], ignore_index=True)
        te = pd.concat([te, u_te], ignore_index=True)
    return tr, va, te
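
# Illustrative sketch, not part of the pipeline and never called: it mirrors
# the np.split index arithmetic used by both split functions above. With
# percentages=(80, 10, 10) and ten rows, the cut points land at rows 8 and 9.
def _example_split_arithmetic():
    toy = pd.DataFrame({'value': range(10)})
    train_p, valid_p = 80, 10
    tr, va, te = np.split(toy, [int((train_p / 100) * len(toy)),
                                int(((train_p + valid_p) / 100) * len(toy))])
    assert (len(tr), len(va), len(te)) == (8, 1, 1)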

def main():
    # print("=== Training Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("training", predefined_training_scenarios,
    #                                predefined_validation_scenarios)
    # print("\n=== Validation Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("validation", predefined_training_scenarios,
    #                                predefined_validation_scenarios)

    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)
    ALLUSERS32_15MIN_WITHOUTTHREHOLD = 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH

    training_data = filter_data(df, TRAINING_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    validation_data = filter_data(df, VALIDATION_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    user_data_train = prepare_user_data(training_data)
    user_data_val = prepare_user_data(validation_data)

    # === Train models ===
    best_models = train_models(user_data_train, user_data_val,
                               sequence_lengths=SEQUENCE_LENGTHS)

    # === Load and evaluate test ===
    test_df = filter_test_data(df, TEST_SCENARIO)
    evaluate_models(best_models, test_df, SEQUENCE_LENGTHS, OUTPUT_EXCEL_PATH,
                    ALLUSERS32_15MIN_WITHOUTTHREHOLD)

    print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")


def reduce_columns(df, filename):
    """Drop time-index columns; 15-minute datasets also carry one-hot weekday columns."""
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + week_column_names,
                       errors='ignore')
    else:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')


def reduce_columns_v3(df):
    return df.drop(columns=[month_str, year_str, date_str])


def load_previous_results(filename):
    """Load earlier evaluation results so interrupted runs can resume where they left off."""
    results = pd.DataFrame()
    if os.path.exists(filename):
        results = pd.DataFrame(json.load(open(filename)))
    return results


def main_two_v2(model_type):
    seq_length = range(10, 31, 5)
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                # Skip combinations that were already evaluated in a previous run.
                if len(results[(results[timespan_str] == timespan_id)
                               & (results[threshold_str] == threshold_id)
                               & (results[sequence_length_str] == sequence_length)
                               & (results[model_type_str] == model_type)]) > 0:
                    continue
            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)
            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)
            best_model = train_models_v2(user_data_train, user_data_val,
                                         sequence_length=sequence_length,
                                         model_type=model_type)
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results,
                                 evaluate_model_on_test_data(model=best_model, test_df=te,
                                                             sequence_length=sequence_length,
                                                             time_span_id=timespan_id,
                                                             threshold_id=threshold_id,
                                                             model_type=model_type,
                                                             split_id=data_split_str)],
                                ignore_index=True)
            results.to_json(result_filename_v2)
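
# Illustrative sketch with hypothetical data, never called by the pipeline: it
# demonstrates the resume check used in main_two_v2 above and main_two_v1
# below. A parameter combination is skipped when a matching row already exists
# in the saved results frame.
def _example_resume_check():
    done = pd.DataFrame({timespan_str: [hour_timespan_str],
                         threshold_str: [with_threshold_str],
                         sequence_length_str: [20],
                         model_type_str: [model_type_gru]})
    already_done = len(done[(done[timespan_str] == hour_timespan_str)
                            & (done[threshold_str] == with_threshold_str)
                            & (done[sequence_length_str] == 20)
                            & (done[model_type_str] == model_type_gru)]) > 0
    assert already_done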

def main_two_v1():
    seq_length = [30, 25, 20, 15, 10, 5]  # adjust as needed
    results = pd.DataFrame()
    if os.path.exists(result_filename_v1):
        results = pd.DataFrame(json.load(open(result_filename_v1)))
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),
                                           (month_split_str, split_data_by_month_percentage)]:
                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
                    timespan_id = hour_timespan_str
                    threshold_id = with_threshold_str
                    if min_timespan_str in data_filename:
                        timespan_id = min_timespan_str
                    if without_threshold_str in data_filename:
                        threshold_id = without_threshold_str
                    if len(results) > 0:
                        # Skip combinations that were already evaluated in a previous run.
                        if len(results[(results[split_str] == split_id)
                                       & (results[timespan_str] == timespan_id)
                                       & (results[threshold_str] == threshold_id)
                                       & (results[sequence_length_str] == sequence_length)
                                       & (results[model_type_str] == model_type)]) > 0:
                            continue
                    file_path = os.path.join(dataset_path, data_filename)
                    df = load_dataset(file_path)
                    df = remove_covid_data(df)
                    tr, val, te = split_method(df, percentages=(80, 10, 10))
                    tr = reduce_columns(tr, data_filename)
                    val = reduce_columns(val, data_filename)
                    te = reduce_columns(te, data_filename)
                    user_data_train = prepare_user_data(tr)
                    user_data_val = prepare_user_data(val)
                    best_models = train_models(user_data_train, user_data_val,
                                               sequence_lengths=[sequence_length],
                                               model_type=model_type)
                    results = pd.concat([results,
                                         evaluate_model_on_test_data(
                                             model=best_models[sequence_length]['model'],
                                             test_df=te, split_id=split_id,
                                             sequence_length=sequence_length,
                                             time_span_id=timespan_id,
                                             threshold_id=threshold_id,
                                             model_type=model_type)],
                                        ignore_index=True)
                    results.to_json(result_filename_v1)


# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id, threshold_id,
                                time_span_id, model_type):
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    recall = sklearn.metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    return pd.DataFrame({split_str: [split_id], threshold_str: [threshold_id],
                         timespan_str: [time_span_id], sequence_length_str: [sequence_length],
                         model_type_str: [model_type], recall_str: [recall],
                         precision_str: [precision], f1_string: [f1_score]})


def visualise_results_v1():
    results = pd.DataFrame(json.load(open(result_filename_v1)))
    # The month split is consistently worse, so only the per-user data split is plotted.
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v1_results.svg')
    # Conclusion: no variant is clearly better than the others.
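
# Illustrative sketch with toy labels, never called: the metrics in
# evaluate_model_on_test_data use average='weighted', which weights each
# class's score by its support, so with imbalanced users the average is pulled
# toward the majority class, as the two values below show.
def _example_weighted_f1():
    y_true = [0, 0, 0, 1]  # class 0 has three times the support of class 1
    y_pred = [0, 0, 1, 1]
    macro = sklearn.metrics.f1_score(y_true, y_pred, average='macro')        # ~0.733
    weighted = sklearn.metrics.f1_score(y_true, y_pred, average='weighted')  # ~0.767
    return macro, weighted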

def visualise_results_v2():
    results = pd.DataFrame(json.load(open(result_filename_v2)))
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan)
                                      & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan)
                                            & (without_threshold[model_type_str] == model)]
            with_sub = with_sub.sort_values(sequence_length_str)
            without_sub = without_sub.sort_values(sequence_length_str)
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string],
                    label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string],
                    label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v2_results.svg')
    # Conclusion: no variant is clearly better than the others.


def test(model_type):
    """Check how the amount of data per user (33/66/100 %) affects test performance."""
    sequence_length = 20
    data_filename = os.listdir(dataset_path)[0]
    timespan_id = hour_timespan_str
    threshold_id = with_threshold_str
    file_path = os.path.join(dataset_path, data_filename)
    df = load_dataset(file_path)
    df = remove_covid_data(df)
    results = pd.DataFrame()
    for percentage in [33, 66, 100]:
        print('Percentage:', percentage)
        tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10),
                                                        sample=percentage)
        tr = reduce_columns(tr, data_filename)
        val = reduce_columns(val, data_filename)
        te = reduce_columns(te, data_filename)
        user_data_train = prepare_user_data(tr)
        user_data_val = prepare_user_data(val)
        best_model = train_models_v2(user_data_train, user_data_val,
                                     sequence_length=sequence_length, model_type=model_type)
        results = pd.concat([results,
                             evaluate_model_on_test_data(model=best_model, test_df=te,
                                                         sequence_length=sequence_length,
                                                         time_span_id=timespan_id,
                                                         threshold_id=threshold_id,
                                                         model_type=model_type,
                                                         split_id=data_split_str)],
                            ignore_index=True)
    print(results)


def manual_tuning(model_type):
    # load dataset
    sequence_length = 20
    data_filename = 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'
    timespan_id = min_timespan_str
    threshold_id = with_threshold_str
    file_path = os.path.join(dataset_path, data_filename)
    df = load_dataset(file_path)
    df = remove_covid_data(df)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=100)
    tr = reduce_columns(tr, data_filename)
    val = reduce_columns(val, data_filename)
    te = reduce_columns(te, data_filename)
    user_data_train = prepare_user_data(tr)
    user_data_val = prepare_user_data(val)

    # fit and evaluate model
    # config
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 16
    l_rate = 1e-4
    reg = L1L2(l1=0.0, l2=0.0)
    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs,
                                  n_neurons, l_rate, reg, sequence_length=sequence_length,
                                  model_type=model_type)
        history_list.append(history)
    # one diagnostic plot per metric, overlaying all repeats
    for metric in ['p', 'r', 'f1']:
        for history in history_list:
            plt.plot(history['train_' + metric], color='blue')
            plt.plot(history['test_' + metric], color='orange')
        plt.savefig(figure_path + metric + '_e' + str(n_epochs) + '_n' + str(n_neurons)
                    + '_b' + str(n_batch) + '_l' + str(l_rate) + '_diagnostic.png')
        plt.clf()
    print('Done')
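
# Illustrative sketch with hypothetical values, never called: a small L1/L2
# grid one might sweep in manual_tuning above. It only builds the regularizer
# configurations; it does not train anything.
def _example_regularizer_grid():
    return [L1L2(l1=l1, l2=l2)
            for l1 in (0.0, 1e-6, 1e-4)
            for l2 in (0.0, 1e-6, 1e-4)]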

def manual_tuning_v3(model_type):
    # TODO: hrs/min + different sequence lengths
    sequence_length = 20
    tr, val, te = get_prepared_data_v3(dataset_hrs_path)

    # fit and evaluate model
    # config
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 16
    l_rate = 1e-4
    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(tr, val, n_batch, n_epochs, n_neurons, l_rate,
                                  sequence_length=sequence_length, model_type=model_type)
        history_list.append(history)
    # one diagnostic plot per metric, overlaying all repeats
    for metric in ['p', 'r', 'f1']:
        for history in history_list:
            plt.plot(history['train_' + metric], color='blue')
            plt.plot(history['test_' + metric], color='orange')
        plt.savefig(figure_path + 'v3/' + metric + '_e' + str(n_epochs) + '_n' + str(n_neurons)
                    + '_b' + str(n_batch) + '_l' + str(l_rate) + '_diagnostic.png')
        plt.clf()
    print('Done')


def calculate_baselines():
    file_combinations = [
        (hour_timespan_str, with_threshold_str, 'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
        (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
        (min_timespan_str, without_threshold_str, 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'),
        (hour_timespan_str, without_threshold_str, 'ALLUSERS_32_1HR_WITHOUT_THRESHOLD.xlsx'),
    ]
    baseline_res = pd.DataFrame()
    for timespan_id, threshold_id, filename in file_combinations:
        file_path = os.path.join(dataset_path, filename)
        df = load_dataset(file_path)
        df = remove_covid_data(df)
        _, _, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20)
        te = reduce_columns(te, filename)
        user_data_te = prepare_user_data(te)
        for sequence_length in range(5, 30, 5):
            x, y = prepare_data_for_model(user_data=user_data_te,
                                          sequence_length=sequence_length)
            for strategy in ['most_frequent', 'stratified', 'uniform']:
                cls = DummyClassifier(strategy=strategy)
                cls.fit(x, y)
                y_pred = cls.predict(x)
                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
                baseline_res = pd.concat([baseline_res,
                                          DataFrame({'strategy': [strategy],
                                                     threshold_str: [threshold_id],
                                                     timespan_str: [timespan_id],
                                                     sequence_length_str: [sequence_length],
                                                     accuracy_str: [acc], precision_str: [p],
                                                     recall_str: [r], f1_string: [f1]})],
                                         ignore_index=True)
    baseline_res.to_json('baseline_results.json')
    print('Done')


def get_prepared_data_v3(filename, sample=100):
    df = pd.read_json(filename)
    df = remove_covid_data(df)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
    tr = reduce_columns_v3(tr)
    val = reduce_columns_v3(val)
    te = reduce_columns_v3(te)
    # Fit the scaler on the training split only, then apply it to all three splits.
    scaler = MinMaxScaler()
    scaler.fit(tr.drop(columns=[user_str]))
    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)


def scale_dataset(scaler, df):
    y = df[user_str]
    x_scaled = scaler.transform(df.drop(columns=[user_str]))
    df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
    df_scaled.columns = df.columns
    return prepare_user_data(df_scaled)  # pass the scaled frame, not the raw input


def calculate_baselines_v3():
    file_combinations = [(hour_timespan_str, dataset_hrs_path),
                         (min_timespan_str, dataset_min_path)]
    baseline_res = pd.DataFrame()
    for timespan_id, filename in file_combinations:
        _, _, te = get_prepared_data_v3(filename)
        for sequence_length in range(5, 30, 5):
            x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
            for strategy in ['most_frequent', 'stratified', 'uniform']:
                cls = DummyClassifier(strategy=strategy)
                cls.fit(x, y)
                y_pred = cls.predict(x)
                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
                baseline_res = pd.concat([baseline_res,
                                          DataFrame({'strategy': [strategy],
                                                     timespan_str: [timespan_id],
                                                     sequence_length_str: [sequence_length],
                                                     accuracy_str: [acc], precision_str: [p],
                                                     recall_str: [r], f1_string: [f1]})],
                                         ignore_index=True)
    baseline_res.to_json('baseline_results_v3.json')
    print('Done')
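
# Illustrative sketch with toy data, never called: shows what the
# 'most_frequent' strategy used by the baseline functions above actually does.
# It always predicts the majority class and ignores the input features.
def _example_majority_baseline():
    x = [[0], [1], [2], [3]]
    y = [7, 7, 7, 9]
    cls = DummyClassifier(strategy='most_frequent')
    cls.fit(x, y)
    assert list(cls.predict(x)) == [7, 7, 7, 7]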

if __name__ == "__main__":
    # main_two_v1()
    # visualise_results_v1()
    # test(model_type=model_type_gru)
    # main_two_v2(model_type=model_type_gru)
    # visualise_results_v2()
    # manual_tuning(model_type=model_type_lstm)
    # calculate_baselines()
    calculate_baselines_v3()
    print('Done')