From ac57fef0e59d3e0c0e7bf86657cf741f9396d9d5 Mon Sep 17 00:00:00 2001 From: Bianca Steffes Date: Tue, 11 Nov 2025 12:49:52 +0100 Subject: [PATCH] Added method for manual testing of hyperparameters --- main.py | 46 ++++++++++++++++++++++++-- pipeline.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 127 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index 7530053..a71702d 100644 --- a/main.py +++ b/main.py @@ -13,7 +13,7 @@ from pipeline import ( prepare_user_data, train_models, evaluate_models, - prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2 + prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model ) year_str = 'Year' @@ -321,11 +321,53 @@ def test(model_type): ignore_index=True) print(results) +def manual_tuning(model_type): + # load dataset + sequence_length = 20 + data_filename = 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx' + timespan_id = min_timespan_str + threshold_id = with_threshold_str + + file_path = os.path.join(dataset_path, data_filename) + df = load_dataset(file_path) + df = remove_covid_data(df) + + tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20) + tr = reduce_columns(tr, data_filename) + val = reduce_columns(val, data_filename) + te = reduce_columns(te, data_filename) + + user_data_train = prepare_user_data(tr) + user_data_val = prepare_user_data(val) + + # fit and evaluate model + # config + repeats = 5 + n_batch = 4 + n_epochs = 500 + n_neurons = 1 + + history_list = list() + # run diagnostic tests + for i in range(repeats): + history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs, n_neurons, + sequence_length=sequence_length, + model_type=model_type) + history_list.append(history) + for metric in ['p', 'r', 'f1']: + for history in history_list: + plt.plot(history['train_'+metric], color='blue') + plt.plot(history['test_'+metric], color='orange') + 
plt.savefig(figure_path+metric+'_epochs_diagnostic.png') + plt.clf() + print('Done') + if __name__ == "__main__": # main_two_v1() # visualise_results_v1() - test(model_type=model_type_gru) + #test(model_type=model_type_gru) # main_two_v2(model_type=model_type_gru) #visualise_results_v2() + manual_tuning(model_type=model_type_lstm) print('Done') diff --git a/pipeline.py b/pipeline.py index 70bfcbe..38c4356 100644 --- a/pipeline.py +++ b/pipeline.py @@ -4,14 +4,15 @@ import pandas as pd import shutil from keras import Input -from keras.src.metrics import F1Score, Precision, Recall, Accuracy -from pandas import ExcelWriter +from keras.src.losses import SparseCategoricalCrossentropy +from keras.src.metrics import F1Score, Precision, Recall, Accuracy, SparseCategoricalAccuracy +from pandas import ExcelWriter, DataFrame from tensorflow.keras.models import Sequential from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,GRU from tensorflow.keras.optimizers import Adam from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping from keras_tuner import RandomSearch -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score epochs = 5#50 model_type_gru = 'GRU' @@ -61,18 +62,25 @@ def prepare_user_data(df): users = df['user'].unique() return {user: df[df['user'] == user] for user in users} +def make_sequences(data, sequence_length): + x, y = [], [] + features = data.drop('user', axis=1).values + features = features.astype(int) + labels = data['user'].values + for i in range(len(features) - sequence_length): + x.append(features[i:i + sequence_length]) + y.append(labels[i + sequence_length]) + return x, y + def prepare_data_for_model(user_data, sequence_length): - X, y = [], [] + x, y = [], [] for user, data in user_data.items(): - features = data.drop('user', axis=1).values - features = features.astype(int) - labels = data['user'].values - for i in range(len(features) - 
sequence_length): - X.append(features[i:i + sequence_length]) - y.append(labels[i + sequence_length]) - X = np.array(X) + x_new, y_new = make_sequences(data, sequence_length) + x = x + x_new + y = y + y_new + x = np.array(x) y = np.array(y) - return X,y + return x,y # === Training & Validation === def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./working/tuner", model_type=model_type_lstm): @@ -197,6 +205,69 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type): return tuner.get_best_models(num_models=1)[0] +def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence_length, model_type): + x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length) + n_features = x.shape[2] + users = list(train_data.keys()) + + # prepare model + def build_model(): + model = Sequential() + model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch)) +# if model_type == model_type_bilstm: + # model.add(Bidirectional(units=units_hp)) + if model_type == model_type_lstm: + model.add(LSTM(n_neurons)) + # if model_type == model_type_gru: + # model.add(GRU(units=units_hp)) + # TODO: add another dense layer + #model.add(Dense(256, activation='relu')) + # model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1))) + model.add(Dense(len(users), activation='softmax')) + model.compile( + optimizer=Adam(learning_rate=1e-5), + loss=SparseCategoricalCrossentropy(), + metrics=[SparseCategoricalAccuracy()], + ) + return model + + model = build_model() + + # fit model + train_p, test_p, train_r, test_r, train_f1, test_f1 = list(), list(),list(), list(),list(), list() + for i in range(n_epochs): + model.fit(x, y, batch_size=n_batch, epochs=1, verbose=0, shuffle=False) + # evaluate model on train data + p, r, f1 = evaluate(model, train_data, sequence_length, n_batch) + train_p.append(p) + train_r.append(r) + train_f1.append(f1) + # evaluate model on validation data + p, r, f1 = evaluate(model, 
val_data, sequence_length, n_batch) + test_p.append(p) + test_r.append(r) + test_f1.append(f1) + + history = DataFrame() + history['train_p'], history['test_p'] = train_p, test_p + history['train_r'], history['test_r'] = train_r, test_r + history['train_f1'], history['test_f1'] = train_f1, test_f1 + return history + + +def evaluate(model, df, sequence_length, batch_size): + x, y = prepare_data_for_model(user_data=df, sequence_length=sequence_length) + x = np.array(x) + y_true = np.array(y) + + y_pred = model.predict(x, verbose=0, batch_size=batch_size) + y_pred_classes = np.argmax(y_pred, axis=1) + f1 = f1_score(y_true=y_true, y_pred=y_pred_classes, average='weighted') + p = precision_score(y_true=y_true, y_pred=y_pred_classes, average='weighted') + r = recall_score(y_true=y_true, y_pred=y_pred_classes, average='weighted') + return p, r, f1 + + # === Evaluation === def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD): print("\n🧪 Evaluating on Test Data...")