diff --git a/figures/v1_results.svg b/figures/v1_results.svg
new file mode 100644
index 0000000..ecd3870
--- /dev/null
+++ b/figures/v1_results.svg
@@ -0,0 +1,1877 @@
+[... 1,877 lines of SVG markup omitted: image/svg+xml generated 2025-09-12 by Matplotlib v3.10.6 (https://matplotlib.org/); the F1-score-vs-sequence-length panels written by visualise_results_v1() ...]
diff --git a/main.py b/main.py
index c49d25d..ee2607c 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,7 @@ import os
 import numpy as np
 import pandas as pd
 import sklearn
+from matplotlib import pyplot as plt
 
 from pipeline import (
     load_dataset,
@@ -12,15 +13,21 @@ from pipeline import (
     prepare_user_data,
     train_models,
     evaluate_models,
-    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm
+    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2
 )
 
 year_str = 'Year'
 month_str = 'Month'
 user_str = 'user'
 split_str = 'split type'
+data_split_str = 'data percentages'
+month_split_str = 'month percentages'
 threshold_str = 'threshold used'
+with_threshold_str = 'WITH'
+without_threshold_str = 'WITHOUT'
 timespan_str = 'time used'
+hour_timespan_str = '1HR'
+min_timespan_str = '15MIN'
 sequence_length_str = 'sequence length'
 precision_str = 'precision'
 recall_str = 'recall'
@@ -28,12 +35,14 @@ f1_string = 'f1 score'
 model_type_str = 'model type'
 weak_column_names = ['DayOfWeek_'+day for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
+figure_path = 'figures/'
 
 # === Configurable Parameters ===
 dataset_path = './Datasets/'
 DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
 OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
-result_filename = './working/evaluation_results.json'
+result_filename_v1 = './working/evaluation_results.json'
+result_filename_v2 = './working/evaluation_results_v2.json'
 SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
 
 TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
@@ -104,26 +113,80 @@ def main():
 
 def reduce_columns(df, filename):
-    if '15MIN' in filename:
+    if min_timespan_str in filename:
         return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
     else:
         return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')
 
-def main_two():
+def load_previous_results(filename):
     results = pd.DataFrame()
-    if os.path.exists(result_filename):
-        results = pd.DataFrame(json.load(open(result_filename)))
-    for sequence_length in SEQUENCE_LENGTHS:
+    if os.path.exists(filename):
+        results = pd.DataFrame(json.load(open(filename)))
+    return results
+
+def main_two_v2(model_type):
+    seq_length = range(20, 31, 10)
+    for sequence_length in seq_length:
+        for data_filename in os.listdir(dataset_path):
+            timespan_id = hour_timespan_str
+            threshold_id = with_threshold_str
+            if min_timespan_str in data_filename:
+                timespan_id = min_timespan_str
+            if without_threshold_str in data_filename:
+                threshold_id = without_threshold_str
+
+            results = load_previous_results(result_filename_v2)
+            if len(results) > 0:
+                if len(results[(results[timespan_str]==timespan_id) &
+                               (results[threshold_str]==threshold_id) &
+                               (results[sequence_length_str]==sequence_length) &
+                               (results[model_type_str]==model_type)]) > 0:
+                    continue
+
+            file_path = os.path.join(dataset_path, data_filename)
+            df = load_dataset(file_path)
+            df = remove_covid_data(df)
+
+            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
+            tr = reduce_columns(tr, data_filename)
+            val = reduce_columns(val, data_filename)
+            te = reduce_columns(te, data_filename)
+
+            user_data_train = prepare_user_data(tr)
+            user_data_val = prepare_user_data(val)
+
+            # train_models_v2 returns a single fitted model for this configuration
+            best_model = train_models_v2(user_data_train, user_data_val,
+                                         sequence_length=sequence_length,
+                                         model_type=model_type)
+
+            results = load_previous_results(result_filename_v2)
+            results = pd.concat([results,
+                                 evaluate_model_on_test_data(model=best_model,
+                                                             test_df=te,
+                                                             sequence_length=sequence_length,
+                                                             time_span_id=timespan_id,
+                                                             threshold_id=threshold_id,
+                                                             model_type=model_type,
+                                                             split_id=data_split_str)],
+                                ignore_index=True)
+            results.to_json(result_filename_v2)
+
+def main_two_v1():
+    seq_length = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
+    results = pd.DataFrame()
+    if os.path.exists(result_filename_v1):
+        results = pd.DataFrame(json.load(open(result_filename_v1)))
+    for sequence_length in seq_length:
         for data_filename in os.listdir(dataset_path):
-            for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]:
+            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),(month_split_str, split_data_by_month_percentage)]:
                 for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
-                    timespan_id = '1HR'
-                    threshold_id = 'WITH'
-                    if '15MIN' in data_filename:
-                        timespan_id = '15MIN'
-                    if 'WITHOUT' in data_filename:
-                        threshold_id = 'WITHOUT'
+                    timespan_id = hour_timespan_str
+                    threshold_id = with_threshold_str
+                    if min_timespan_str in data_filename:
+                        timespan_id = min_timespan_str
+                    if without_threshold_str in data_filename:
+                        threshold_id = without_threshold_str
                     if len(results) > 0:
                         if len(results[(results[split_str]==split_id) &
                                        (results[timespan_str]==timespan_id) &
@@ -152,8 +215,7 @@ def main_two():
                                                                  time_span_id=timespan_id, threshold_id=threshold_id,
                                                                  model_type=model_type)], ignore_index=True)
-                    results.to_json(result_filename)
-
+                    results.to_json(result_filename_v1)
 
 
 # === Evaluation ===
 def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type):
@@ -171,7 +233,34 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh
                          model_type_str:[model_type],
                          recall_str:[recall], precision_str:[precision], f1_string:[f1_score]})
 
 
+def visualise_results_v1():
+    results = pd.DataFrame(json.load(open(result_filename_v1)))
+    # Month split is always worse, so only the data-percentage split is kept
+    results = results[results[split_str] == data_split_str]
+    with_threshold = results[results[threshold_str] == with_threshold_str]
+    without_threshold = results[results[threshold_str] == without_threshold_str]
+    fig, axes = plt.subplots(2, 3)
+    ax_col_id = 0
+    ax_row_id = -1
+    for timespan in [hour_timespan_str, min_timespan_str]:
+        ax_row_id += 1
+        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
+            with_sub = with_threshold[(with_threshold[timespan_str] == timespan) & (with_threshold[model_type_str] == model)]
+            without_sub = without_threshold[(without_threshold[timespan_str] == timespan) & (without_threshold[model_type_str] == model)]
+            ax = axes[ax_row_id, ax_col_id]
+            ax.set_title(model+' '+timespan)
+            ax.plot(with_sub[sequence_length_str], with_sub[f1_string], label=with_threshold_str)
+            ax.plot(without_sub[sequence_length_str], without_sub[f1_string], label=without_threshold_str)
+            ax.legend()
+            ax_col_id += 1
+            ax_col_id %= 3
+    fig.tight_layout()
+    fig.savefig(figure_path+'v1_results.svg')
+    # Conclusion: no variant is clearly better than the others
+
 if __name__ == "__main__":
-    main_two()
+    # main_two_v1()
+    # visualise_results_v1()
+    main_two_v2(model_type=model_type_gru)
     print('Done')
diff --git a/pipeline.py b/pipeline.py
index a831131..f205a88 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -1,7 +1,10 @@
+import keras_tuner
 import numpy as np
 import pandas as pd
 import shutil
 import os
+
+from keras.src.metrics import F1Score
 from pandas import ExcelWriter
 import keras_tuner as kt
 from tensorflow.keras.models import Sequential
@@ -11,7 +14,7 @@ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
 from sklearn.metrics import accuracy_score
 
-epochs = 50
+epochs = 30
 model_type_gru = 'GRU'
 model_type_lstm = 'LSTM'
 model_type_bilstm = 'BiLSTM'
@@ -139,6 +142,58 @@ def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./workin
     return best_models
 
 
+# === Training & Validation ===
+def train_models_v2(user_data, user_data_val, sequence_length, model_type):
+    tuner_dir = "./working/tuner"
+
+    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
+    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
+                                     patience=2)
+
+    shutil.rmtree(tuner_dir, ignore_errors=True)
+
+    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
+    x_val, y_val = prepare_data_for_model(user_data=user_data_val, sequence_length=sequence_length)
+
+    n_features = x.shape[2]
+    users = list(user_data.keys())
+
+    def build_model(hp):
+        model = Sequential()
+        if model_type==model_type_bilstm:
+            model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
+                                         input_shape=(sequence_length, n_features))))
+        if model_type==model_type_lstm:
+            model.add(LSTM(units=hp.Int('units', 32, 256, step=2),
+                           input_shape=(sequence_length, n_features)))
+        if model_type==model_type_gru:
+            model.add(GRU(units=hp.Int('units', 32, 256, step=2),
+                          input_shape=(sequence_length, n_features)))
+        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
+        model.add(Dense(len(users), activation='softmax'))
+        model.compile(
+            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
+            loss='sparse_categorical_crossentropy',
+            metrics=['accuracy']
+        )
+        return model
+
+    tuner = RandomSearch(
+        build_model,
+        objective='val_loss',
+        max_trials=100,
+        directory=tuner_dir,
+    )
+
+    tuner.search(x, y, epochs=epochs, validation_data=(x_val, y_val),
+                 callbacks=[early_stopping, lr_scheduler])
+
+    best_hps = tuner.get_best_hyperparameters(1)[0]
+    best_model = tuner.hypermodel.build(best_hps)
+    best_model.fit(x, y, epochs=epochs, validation_data=(x_val, y_val),
+                   callbacks=[early_stopping, lr_scheduler])
+    return best_model
+
+
 # === Evaluation ===
 def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD):
     print("\n🧪 Evaluating on Test Data...")
diff --git a/requirements.txt b/requirements.txt
index 833dbb3..f0a4af0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,3 +46,5 @@ tzdata==2025.2
 urllib3==2.5.0
 Werkzeug==3.1.3
 wrapt==1.17.2
+
+matplotlib~=3.10.6
\ No newline at end of file
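
For reviewers, a minimal usage sketch (not part of the patch) of how the v2 sweep could be driven for all three architectures and the accumulated results inspected afterwards. It assumes `main.py` is importable as a module and reuses only names introduced or kept by this diff (`main_two_v2`, `result_filename_v2`, the model-type and result-column constants); the driver script itself is hypothetical.

```python
# Hypothetical driver script, shown for illustration only.
import json

import pandas as pd

from main import (main_two_v2, result_filename_v2, model_type_gru,
                  model_type_lstm, model_type_bilstm, model_type_str, f1_string)

# main_two_v2 skips timespan/threshold/sequence-length combinations already
# stored in evaluation_results_v2.json, so each call resumes the sweep.
for model_type in [model_type_gru, model_type_lstm, model_type_bilstm]:
    main_two_v2(model_type=model_type)

# Summarise the stored results: best F1 score per architecture.
results = pd.DataFrame(json.load(open(result_filename_v2)))
print(results.groupby(model_type_str)[f1_string].max())
```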