
Added a new version to run with an adjusted training process

master
Bianca Steffes 2 months ago
commit 93211b811e
  1. figures/v1_results.svg (1877 lines changed)
  2. main.py (123 lines changed)
  3. pipeline.py (57 lines changed)
  4. requirements.txt (2 lines changed)

figures/v1_results.svg (1877 lines changed)
File diff suppressed because it is too large

main.py (123 lines changed)

@@ -4,6 +4,7 @@ import os
import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot as plt
from pipeline import (
    load_dataset,
@@ -12,15 +13,21 @@ from pipeline import (
    prepare_user_data,
    train_models,
    evaluate_models,
    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm
    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2
)
year_str = 'Year'
month_str = 'Month'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
@@ -28,12 +35,14 @@ f1_string = 'f1 score'
model_type_str = 'model type'
weak_column_names = ['DayOfWeek_'+day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
figure_path = 'figures/'
# === Configurable Parameters ===
dataset_path = './Datasets/'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename = './working/evaluation_results.json'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
@@ -104,26 +113,80 @@ def main():
def reduce_columns(df, filename):
    if '15MIN' in filename:
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
    else:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')
def main_two():
def load_previous_results(filename):
    results = pd.DataFrame()
    if os.path.exists(result_filename):
        results = pd.DataFrame(json.load(open(result_filename)))
    for sequence_length in SEQUENCE_LENGTHS:
    if os.path.exists(filename):
        results = pd.DataFrame(json.load(open(filename)))
    return results
def main_two_v2(model_type):
    seq_length = range(20, 31, 10)
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                if len(results[(results[timespan_str]==timespan_id) &
                               (results[threshold_str]==threshold_id) &
                               (results[sequence_length_str]==sequence_length) &
                               (results[model_type_str]==model_type)]) > 0:
                    continue
            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)
            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)
            # train_models_v2 returns a single fitted model for this sequence length
            best_model = train_models_v2(user_data_train, user_data_val,
                                         sequence_length=sequence_length,
                                         model_type=model_type)
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results,
                                 evaluate_model_on_test_data(model=best_model,
                                                             test_df=te,
                                                             sequence_length=sequence_length,
                                                             time_span_id=timespan_id,
                                                             threshold_id=threshold_id,
                                                             model_type=model_type,
                                                             split_id=data_split_str)],
                                ignore_index=True)
            results.to_json(result_filename_v2)
def main_two_v1():
    seq_length = [30, 25, 20, 15, 10, 5]  # You can add more: [20, 25, 30]
    results = pd.DataFrame()
    if os.path.exists(result_filename_v1):
        results = pd.DataFrame(json.load(open(result_filename_v1)))
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [('data percentages', split_data_by_userdata_percentage), ('month percentages', split_data_by_month_percentage)]:
            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage), (month_split_str, split_data_by_month_percentage)]:
                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
                    timespan_id = '1HR'
                    threshold_id = 'WITH'
                    if '15MIN' in data_filename:
                        timespan_id = '15MIN'
                    if 'WITHOUT' in data_filename:
                        threshold_id = 'WITHOUT'
                    timespan_id = hour_timespan_str
                    threshold_id = with_threshold_str
                    if min_timespan_str in data_filename:
                        timespan_id = min_timespan_str
                    if without_threshold_str in data_filename:
                        threshold_id = without_threshold_str
                    if len(results) > 0:
                        if len(results[(results[split_str]==split_id) &
                                       (results[timespan_str]==timespan_id) &
@@ -152,8 +215,7 @@ def main_two():
                                                time_span_id=timespan_id,
                                                threshold_id=threshold_id,
                                                model_type=model_type)], ignore_index=True)
                    results.to_json(result_filename)
                    results.to_json(result_filename_v1)
# === Evaluation ===
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type):
@@ -171,7 +233,34 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh
                         model_type_str:[model_type], recall_str:[recall],
                         precision_str:[precision], f1_string:[f1_score]})
def visualise_results_v1():
    results = pd.DataFrame(json.load(open(result_filename_v1)))
    # Month split is always worse
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan) & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan) & (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model+' '+timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string], label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string], label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path+'v1_results.svg')
    # Conclusion: no clearly better version is discernible
if __name__ == "__main__":
    main_two()
    # main_two_v1()
    # visualise_results_v1()
    main_two_v2(model_type=model_type_gru)
    print('Done')
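
Note on the v2 wiring: train_models_v2 (added in pipeline.py below) returns a single fitted model rather than the per-sequence-length dictionary produced by train_models, so main_two_v2 uses the return value directly. A minimal sketch of the intended call pattern, assuming the helpers from this diff and an already prepared train/validation/test split (user_data_train, user_data_val, te):

    best_model = train_models_v2(user_data_train, user_data_val,
                                 sequence_length=30, model_type=model_type_gru)
    row = evaluate_model_on_test_data(model=best_model, test_df=te, sequence_length=30,
                                      split_id=data_split_str, threshold_id=with_threshold_str,
                                      time_span_id=hour_timespan_str, model_type=model_type_gru)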

pipeline.py (57 lines changed)

@@ -1,7 +1,10 @@
import keras_tuner
import numpy as np
import pandas as pd
import shutil
import os
from keras.src.metrics import F1Score
from pandas import ExcelWriter
import keras_tuner as kt
from tensorflow.keras.models import Sequential
@@ -11,7 +14,7 @@ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras_tuner import RandomSearch
from sklearn.metrics import accuracy_score
epochs = 50
epochs = 30
model_type_gru = 'GRU'
model_type_lstm = 'LSTM'
model_type_bilstm = 'BiLSTM'
@@ -139,6 +142,58 @@ def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./workin
    return best_models
# === Training & Validation ===
def train_models_v2(user_data, user_data_val, sequence_length, model_type):
    tuner_dir = "./working/tuner"
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
    shutil.rmtree(tuner_dir, ignore_errors=True)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    x_val, y_val = prepare_data_for_model(user_data=user_data_val, sequence_length=sequence_length)
    n_features = x.shape[2]
    users = list(user_data.keys())
    def build_model(hp):
        model = Sequential()
        if model_type == model_type_bilstm:
            model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
                                         input_shape=(sequence_length, n_features))))
        if model_type == model_type_lstm:
            model.add(LSTM(units=hp.Int('units', 32, 256, step=2),
                           input_shape=(sequence_length, n_features)))
        if model_type == model_type_gru:
            model.add(GRU(units=hp.Int('units', 32, 256, step=2),
                          input_shape=(sequence_length, n_features)))
        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
        model.add(Dense(len(users), activation='softmax'))
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model
    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=100,
        directory=tuner_dir,
    )
    tuner.search(x, y, epochs=epochs, validation_data=(x_val, y_val),
                 callbacks=[early_stopping, lr_scheduler])
    best_hps = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hps)
    best_model.fit(x, y, epochs=epochs, validation_data=(x_val, y_val),
                   callbacks=[early_stopping, lr_scheduler])
    return best_model
# === Evaluation ===
def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD):
    print("\n🧪 Evaluating on Test Data...")
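
The new train_models_v2 hands hyperparameter selection (units, dropout rate, learning rate) to keras_tuner's RandomSearch and then rebuilds and refits the best configuration; EarlyStopping on val_loss keeps runs well under the reduced 30-epoch budget. A short illustrative sketch, not part of the commit, of how the chosen configuration could be inspected once the search above has finished (variable names follow the function body):

    best_hps = tuner.get_best_hyperparameters(1)[0]
    print(best_hps.get('units'), best_hps.get('dropout_rate'), best_hps.get('learning_rate'))
    tuner.results_summary(num_trials=3)  # keras_tuner summary of the top trials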

requirements.txt (2 lines changed)

@@ -46,3 +46,5 @@ tzdata==2025.2
urllib3==2.5.0
Werkzeug==3.1.3
wrapt==1.17.2
matplotlib~=3.10.6