From 40b32c30d365c4e51aa09388c683fa0af20ae402 Mon Sep 17 00:00:00 2001
From: Bianca Steffes
Date: Tue, 18 Nov 2025 13:11:48 +0100
Subject: [PATCH] Add baselines and extend manual evaluation

Add DummyClassifier baselines (most_frequent, stratified, uniform) over
all four datasets and sequence lengths, track accuracy alongside
precision, recall and F1 during manual tuning, and expose the learning
rate and an L1L2 regularizer as manual tuning knobs.

---
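Notes for reviewers (git am ignores everything from the '---' above down
to the first diff header, so these notes travel with the patch without
entering the commit message):

prepare_data_for_model() now shuffles x and y with two freshly seeded
random.Random(17) instances. That idiom is correct but fragile:
shuffle() draws only on the RNG state and the list length, so two
generators with the same seed apply the same permutation to two
equal-length lists. A standalone self-check of the idiom (toy data, not
the project's):

    import random

    x = [[0], [1], [2], [3]]
    y = [10, 11, 12, 13]
    random.Random(17).shuffle(x)  # same seed, same length ...
    random.Random(17).shuffle(y)  # ... hence the same permutation
    assert [row[0] + 10 for row in x] == y  # pairs stay aligned

If the two lists ever differ in length, the pairing breaks silently;
shuffling a list of indices and reordering both x and y through it
would be the more defensive variant.
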
 .gitignore  |  1 +
 main.py     | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 pipeline.py | 33 +++++++++++++++++++++++----------
 3 files changed, 74 insertions(+), 17 deletions(-)
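
A second note, on the baselines: calculate_baselines() scores sklearn's
DummyClassifier strategies on the held-out split to give the recurrent
models a floor to beat. The dummies are fit and evaluated on the same
split; that is acceptable here because they ignore the features
entirely. A standalone sketch of the idea (shapes, sizes and seeds are
illustrative only, not the project's data):

    import numpy as np
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score

    rng = np.random.default_rng(0)
    x = rng.normal(size=(200, 8))      # 200 samples, 8 features
    y = rng.integers(0, 32, size=200)  # 32 user classes

    for strategy in ['most_frequent', 'stratified', 'uniform']:
        cls = DummyClassifier(strategy=strategy, random_state=0)
        cls.fit(x, y)                  # dummy strategies ignore x
        print(strategy, accuracy_score(y, cls.predict(x)))

With 32 roughly balanced classes, all three strategies should land near
1/32 accuracy, which is the floor the LSTM/GRU results are measured
against.
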
diff --git a/.gitignore b/.gitignore
index fca8e1c..5997bd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,4 @@ cython_debug/
 working/tuner
 working
 figures
+baseline_results.json
diff --git a/main.py b/main.py
index a71702d..e4f23b1 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,10 @@ import os
 import numpy as np
 import pandas as pd
 import sklearn
+from keras.regularizers import L1L2
 from matplotlib import pyplot as plt
+from pandas import DataFrame
+from sklearn.dummy import DummyClassifier
 
 from pipeline import (
     load_dataset,
@@ -13,7 +16,8 @@ from pipeline import (
     prepare_user_data,
     train_models,
     evaluate_models,
-    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model
+    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model,
+    eval_metrics
 )
 
 year_str = 'Year'
@@ -29,6 +33,7 @@ timespan_str = 'time used'
 hour_timespan_str = '1HR'
 min_timespan_str = '15MIN'
 sequence_length_str = 'sequence length'
+accuracy_str = 'accuracy'
 precision_str = 'precision'
 recall_str = 'recall'
 f1_string = 'f1 score'
@@ -332,7 +337,7 @@ def manual_tuning(model_type):
 
     df = load_dataset(file_path)
     df = remove_covid_data(df)
-    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20)
+    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=100)
     tr = reduce_columns(tr, data_filename)
     val = reduce_columns(val, data_filename)
     te = reduce_columns(te, data_filename)
@@ -342,15 +347,18 @@ def manual_tuning(model_type):
 
     # fit and evaluate model
    # config
-    repeats = 5
-    n_batch = 4
+    repeats = 3
+    n_batch = 1024
     n_epochs = 500
-    n_neurons = 1
+    n_neurons = 16
+    l_rate = 1e-4
+    reg = L1L2(l1=0.0, l2=0.0)  # no-op at 0.0; raise l1/l2 to actually regularise
     history_list = list()
     # run diagnostic tests
     for i in range(repeats):
-        history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs, n_neurons,
+        history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs,
+                                  n_neurons, l_rate, reg,
                                   sequence_length=sequence_length, model_type=model_type)
         history_list.append(history)
@@ -358,11 +366,45 @@ def manual_tuning(model_type):
         for history in history_list:
             plt.plot(history['train_'+metric], color='blue')
             plt.plot(history['test_'+metric], color='orange')
-        plt.savefig(figure_path+metric+'_epochs_diagnostic.png')
+        plt.savefig(figure_path+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
+                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
         plt.clf()
     print('Done')
 
 
+def calculate_baselines():
+    file_combinations = [(hour_timespan_str, with_threshold_str, 'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
+                         (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
+                         (min_timespan_str, without_threshold_str, 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'),
+                         (hour_timespan_str, without_threshold_str, 'ALLUSERS_32_1HR_WITHOUT_THRESHOLD.xlsx'),
+                         ]
+
+    baseline_res = pd.DataFrame()
+    for timespan_id, threshold_id, filename in file_combinations:
+        file_path = os.path.join(dataset_path, filename)
+        df = load_dataset(file_path)
+        df = remove_covid_data(df)
+
+        _, _, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20)
+        te = reduce_columns(te, filename)
+        user_data_te = prepare_user_data(te)
+        for sequence_length in range(5, 30, 5):
+            x, y = prepare_data_for_model(user_data=user_data_te, sequence_length=sequence_length)
+
+            for strategy in ['most_frequent', 'stratified', 'uniform']:
+                cls = DummyClassifier(strategy=strategy)
+                cls.fit(x, y)
+                y_pred = cls.predict(x)
+                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
+                baseline_res = pd.concat([baseline_res,
+                                          DataFrame({'strategy': [strategy], threshold_str: [threshold_id],
+                                                     timespan_str: [timespan_id], sequence_length_str: [sequence_length],
+                                                     accuracy_str: [acc], precision_str: [p], recall_str: [r],
+                                                     f1_string: [f1]})], ignore_index=True)
+    baseline_res.to_json('baseline_results.json')
+    print('Done')
+
+
 if __name__ == "__main__":
     # main_two_v1()
     # visualise_results_v1()
@@ -370,4 +412,5 @@ if __name__ == "__main__":
     # main_two_v2(model_type=model_type_gru)
     #visualise_results_v2()
     manual_tuning(model_type=model_type_lstm)
+    #calculate_baselines()
     print('Done')
diff --git a/pipeline.py b/pipeline.py
index 38c4356..1e875ad 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -1,3 +1,6 @@
+import random
+
+from sklearn.metrics import accuracy_score  # used by eval_metrics() below
 import keras_tuner
 import numpy as np
 import pandas as pd
@@ -78,6 +81,8 @@ def prepare_data_for_model(user_data, sequence_length):
         x_new, y_new = make_sequences(data, sequence_length)
         x = x + x_new
         y = y + y_new
+    random.Random(17).shuffle(x)  # fresh Random(17) each call: both shuffles draw
+    random.Random(17).shuffle(y)  # the same permutation, so x/y pairs stay aligned
     x = np.array(x)
     y = np.array(y)
     return x,y
@@ -205,7 +210,7 @@
     return tuner.get_best_models(num_models=1)[0]
 
 
-def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence_length, model_type):
+def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
     x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
     n_features = x.shape[2]
     users = list(train_data.keys())
@@ -225,7 +230,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
         # model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
-        model.add(Dense(len(users), activation='softmax'))
+        model.add(Dense(len(users), activation='softmax', bias_regularizer=reg))  # Input() holds no weights, so reg attaches here
         model.compile(
-            optimizer=Adam(learning_rate=1e-5),
+            optimizer=Adam(learning_rate=l_rate),
             loss=SparseCategoricalCrossentropy(),
             metrics=[SparseCategoricalAccuracy()],
         )
@@ -234,21 +239,24 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
     model = build_model()
 
     # fit model
-    train_p, test_p, train_r, test_r, train_f1, test_f1 = list(), list(),list(), list(),list(), list()
+    train_acc, test_acc, train_p, test_p, train_r, test_r, train_f1, test_f1 = list(), list(), list(), list(), list(), list(), list(), list()
     for i in range(n_epochs):
         model.fit(x, y, batch_size=n_batch, epochs=1, verbose=0, shuffle=False)
 
         # evaluate model on train data
-        p, r, f1 = evaluate(model, train_data, sequence_length, n_batch)
+        acc, p, r, f1 = evaluate(model, train_data, sequence_length, n_batch)
+        train_acc.append(acc)
         train_p.append(p)
         train_r.append(r)
         train_f1.append(f1)
 
         # evaluate model on test data
-        p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
+        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
+        test_acc.append(acc)
         test_p.append(p)
         test_r.append(r)
         test_f1.append(f1)
 
     history = DataFrame()
+    history['train_acc'], history['test_acc'] = train_acc, test_acc
     history['train_p'], history['test_p'] = train_p, test_p
     history['train_r'], history['test_r'] = train_r, test_r
     history['train_f1'], history['test_f1'] = train_f1, test_f1
@@ -262,11 +270,16 @@ def evaluate(model, df, sequence_length, batch_size):
 
     y_pred = model.predict(x, verbose=0, batch_size=batch_size)
     y_pred_classes = np.argmax(y_pred, axis=1)
-    f1 = f1_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
-    p = precision_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
-    r = recall_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
-    return p, r, f1
+    return eval_metrics(y_true=y_true, y_pred=y_pred_classes)
+
+
+def eval_metrics(y_true, y_pred):
+    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
+    p = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
+    r = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
+    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
+    return acc, p, r, f1
 
 
 # === Evaluation ===
 def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path):
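-- 
A note on the regularizer handling in train_one_model(): keras.Input
only describes the input tensor and holds no weights, so it accepts no
regularizer arguments; reg is therefore attached to the Dense output
layer instead. Where regularizers can live in a comparable model
(illustrative sketch, layer sizes made up):

    from keras import Sequential
    from keras.layers import Dense, Input, LSTM
    from keras.regularizers import L1L2

    reg = L1L2(l1=0.0, l2=1e-4)
    model = Sequential()
    model.add(Input(shape=(10, 4)))            # no weights, no regularizers
    model.add(LSTM(16, bias_regularizer=reg))  # recurrent layers accept them too
    model.add(Dense(32, activation='softmax', bias_regularizer=reg))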