From f20b85216125a13c466c291b258e6033d381f314 Mon Sep 17 00:00:00 2001
From: Bianca Steffes
Date: Tue, 28 Oct 2025 10:56:22 +0100
Subject: [PATCH] Added more tests

---
 .gitignore             |    1 +
 figures/v1_results.svg | 1877 ----------------------------------------
 main.py                |   44 +-
 pipeline.py            |   34 +-
 4 files changed, 62 insertions(+), 1894 deletions(-)
 delete mode 100644 figures/v1_results.svg

diff --git a/.gitignore b/.gitignore
index aff3da2..fca8e1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ cython_debug/
 .idea
 working/tuner
 working
+figures
diff --git a/figures/v1_results.svg b/figures/v1_results.svg
deleted file mode 100644
index ecd3870..0000000
--- a/figures/v1_results.svg
+++ /dev/null
@@ -1,1877 +0,0 @@
-[1877 lines of SVG markup removed: Matplotlib v3.10.6 figure (image/svg+xml, created 2025-09-12T09:20:26), https://matplotlib.org/]
diff --git a/main.py b/main.py
index a5579bc..7530053 100644
--- a/main.py
+++ b/main.py
@@ -68,11 +68,11 @@ def split_data_by_month_percentage(df, percentages):
     tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
     return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')
 
-def split_data_by_userdata_percentage(df, percentages):
+def split_data_by_userdata_percentage(df, percentages, sample):
     train_p, valid_p, test_p = percentages
     tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
     for user_id in df[user_str].unique():
-        user_data = df[df[user_str] == user_id].sort_values([year_str, month_str])
+        user_data = df[df[user_str] == user_id].sample(frac=sample/100).sort_values([year_str, month_str])
         u_tr, u_va, u_te = np.split(user_data, [int((train_p/100)*len(user_data)), int(((train_p+valid_p)/100)*len(user_data))])
         tr = pd.concat([tr, u_tr], ignore_index=True)
         va = pd.concat([va, u_va], ignore_index=True)
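
The new `sample` argument draws a random fraction of each user's rows before the chronological split. A minimal standalone sketch of that step, assuming the module constants user_str = 'user', year_str = 'year', month_str = 'month'; the toy frame and the .iloc slicing (equivalent to the patch's np.split call) are illustrative only:

    import pandas as pd

    user_str, year_str, month_str = 'user', 'year', 'month'  # assumed constants

    toy = pd.DataFrame({
        user_str:  ['a'] * 10,
        year_str:  [2021] * 10,
        month_str: list(range(1, 11)),
    })

    def split_one_user(user_data, percentages=(80, 10, 10), sample=50):
        train_p, valid_p, test_p = percentages
        # Draw `sample` percent of the rows, then restore chronological order
        # so the positional split below stays time-ordered.
        user_data = user_data.sample(frac=sample / 100).sort_values([year_str, month_str])
        cut1 = int((train_p / 100) * len(user_data))
        cut2 = int(((train_p + valid_p) / 100) * len(user_data))
        return user_data.iloc[:cut1], user_data.iloc[cut1:cut2], user_data.iloc[cut2:]

    u_tr, u_va, u_te = split_one_user(toy)
    print(len(u_tr), len(u_va), len(u_te))  # 4 0 1

Note that int() truncation can leave the validation slice empty for small per-user row counts, which may matter at sample=33.
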
@@ -285,9 +285,47 @@ def visualise_results_v2():
 
     # Conclusion: no clearly better versions identifiable
 
+def test(model_type):
+    sequence_length = 20
+    data_filename = os.listdir(dataset_path)[0]
+    timespan_id = hour_timespan_str
+    threshold_id = with_threshold_str
+
+    file_path = os.path.join(dataset_path, data_filename)
+    df = load_dataset(file_path)
+    df = remove_covid_data(df)
+    results = pd.DataFrame()
+
+    for percentage in [33, 66, 100]:
+        print('Percentage:', percentage)
+        tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=percentage)
+        tr = reduce_columns(tr, data_filename)
+        val = reduce_columns(val, data_filename)
+        te = reduce_columns(te, data_filename)
+
+        user_data_train = prepare_user_data(tr)
+        user_data_val = prepare_user_data(val)
+
+        best_model = train_models_v2(user_data_train, user_data_val,
+                                     sequence_length=sequence_length,
+                                     model_type=model_type)
+
+        results = pd.concat([results,
+                             evaluate_model_on_test_data(model=best_model,
+                                                         test_df=te,
+                                                         sequence_length=sequence_length,
+                                                         time_span_id=timespan_id,
+                                                         threshold_id=threshold_id,
+                                                         model_type=model_type,
+                                                         split_id=data_split_str)],
+                            ignore_index=True)
+    print(results)
+
+
 if __name__ == "__main__":
     # main_two_v1()
     # visualise_results_v1()
-    main_two_v2(model_type=model_type_gru)
+    test(model_type=model_type_gru)
+    # main_two_v2(model_type=model_type_gru)
     # visualise_results_v2()
     print('Done')
diff --git a/pipeline.py b/pipeline.py
index f2997c5..70bfcbe 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -4,7 +4,7 @@ import pandas as pd
 import shutil
 
 from keras import Input
-from keras.src.metrics import F1Score
+from keras.src.metrics import F1Score, Precision, Recall, Accuracy
 from pandas import ExcelWriter
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, GRU
@@ -13,7 +13,7 @@ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
 from sklearn.metrics import accuracy_score
 
-epochs = 50
+epochs = 5  # was 50
 model_type_gru = 'GRU'
 model_type_lstm = 'LSTM'
 model_type_bilstm = 'BiLSTM'
@@ -57,14 +57,15 @@ def filter_test_data(df, scenario):
     return pd.concat(data_parts, ignore_index=True)
 
 def prepare_user_data(df):
-    df_sorted = df.sort_values(by='user').reset_index(drop=True)
-    users = df_sorted['user'].unique()
-    return {user: df_sorted[df_sorted['user'] == user] for user in users}
+    # df_sorted = df.sort_values(by='user').reset_index(drop=True)
+    users = df['user'].unique()
+    return {user: df[df['user'] == user] for user in users}
 
 def prepare_data_for_model(user_data, sequence_length):
     X, y = [], []
     for user, data in user_data.items():
         features = data.drop('user', axis=1).values
+        features = features.astype(int)
         labels = data['user'].values
         for i in range(len(features) - sequence_length):
             X.append(features[i:i + sequence_length])
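
For reference, a small sketch of the windowing that prepare_data_for_model performs after the added astype(int) cast. The hunk's context ends at the X.append line, so the y.append step below is an assumption about the surrounding code, and the toy frame is illustrative only:

    import numpy as np
    import pandas as pd

    sequence_length = 3
    data = pd.DataFrame({'f1': range(10), 'f2': range(10, 20), 'user': [7] * 10})

    features = data.drop('user', axis=1).values
    features = features.astype(int)  # mirrors the added cast
    labels = data['user'].values

    X, y = [], []
    for i in range(len(features) - sequence_length):
        X.append(features[i:i + sequence_length])  # window of consecutive rows
        y.append(labels[i + sequence_length])      # assumed: label of the row after the window

    X, y = np.array(X), np.array(y)
    print(X.shape, y.shape)  # (7, 3, 2) (7,)
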
@@ -144,9 +145,11 @@ def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./workin
 
 # === Training & Validation ===
 def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     tuner_dir = "./working/tuner/" + model_type
+    # val_metric = 'val_f1'
+    val_metric = 'val_precision'
 
-    early_stopping = EarlyStopping(monitor='val_f1', patience=3, restore_best_weights=True)
-    lr_scheduler = ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=2)
+    early_stopping = EarlyStopping(monitor=val_metric, patience=3, restore_best_weights=True)
+    lr_scheduler = ReduceLROnPlateau(monitor=val_metric, factor=0.5, patience=2)
 
     shutil.rmtree(tuner_dir, ignore_errors=True)
@@ -156,11 +159,12 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     n_features = x.shape[2]
     users = list(user_data.keys())
 
-    y_val = np.array(y_val).reshape(-1, 1)
-    y = np.array(y).reshape(-1, 1)
+    # y_val = np.array(y_val).reshape(-1, 1)
+    # y = np.array(y).reshape(-1, 1)
 
     def build_model(hp):
-        units_hp = hp.Int('units', 2, 256, step=2, sampling="log")
+        units_hp = hp.Int('units', 2, 8, step=2, sampling="log")
+        # units_hp = hp.Int('units', 2, 256, step=2, sampling="log")
 
         model = Sequential()
         model.add(Input((sequence_length, n_features)))
@@ -170,18 +174,20 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
             model.add(LSTM(units=units_hp))
         if model_type == model_type_gru:
             model.add(GRU(units=units_hp))
-        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
+        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
         model.add(Dense(len(users), activation='softmax'))
         model.compile(
-            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
+            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-5])),
             loss='sparse_categorical_crossentropy',
-            metrics=[F1Score(name='f1', average='weighted')]
+            metrics=[  # F1Score(name='f1', average='weighted'),
+                Precision(),  # Recall(), Accuracy()
+            ]
         )
         return model
 
     tuner = RandomSearch(
         build_model,
-        objective=keras_tuner.Objective("val_f1", direction="max"),
+        objective=keras_tuner.Objective(val_metric, direction="max"),
         max_trials=120,
         directory=tuner_dir,
     )
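
The callbacks and the tuner objective now key off the shared val_metric string. Keras registers the Precision() metric under the default name 'precision', so the validation log exposes 'val_precision', which EarlyStopping, ReduceLROnPlateau, and keras_tuner.Objective can all monitor. One caveat: depending on the Keras version, mode='auto' may not infer that precision should be maximized, so passing mode='max' explicitly can be safer. A minimal self-contained sketch with a toy binary model (not the project's architecture):

    import numpy as np
    from keras import Input
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.metrics import Precision
    from tensorflow.keras.models import Sequential

    val_metric = 'val_precision'  # same convention as train_models_v2

    model = Sequential()
    model.add(Input((4,)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[Precision()])

    x = np.random.rand(64, 4).astype('float32')
    y = (np.random.rand(64) > 0.5).astype('float32')

    history = model.fit(
        x, y, validation_split=0.25, epochs=2, verbose=0,
        callbacks=[EarlyStopping(monitor=val_metric, mode='max', patience=3,
                                 restore_best_weights=True)],
    )
    print(sorted(history.history))  # ['loss', 'precision', 'val_loss', 'val_precision']
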