From 98831ce4f34c528fe5c5d4687d8cd24e9ae45907 Mon Sep 17 00:00:00 2001
From: Bianca Steffes
Date: Wed, 21 Jan 2026 09:20:43 +0100
Subject: [PATCH] Updated code for different tests

---
 environment.yaml |  1 +
 main.py          | 92 ++++++++++++++++++++++++++++++++++++++++--------
 pipeline.py      | 46 +++++++++++++++++-------
 3 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/environment.yaml b/environment.yaml
index 263a2b0..44d8b01 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -27,6 +27,7 @@ dependencies:
   - Markdown==3.8.2
   - markdown-it-py==3.0.0
   - MarkupSafe==3.0.2
+  - matplotlib
   - mdurl==0.1.2
   - ml_dtypes==0.5.1
   - namex==0.1.0
diff --git a/main.py b/main.py
index 7041039..6d7dc4b 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 import json
 import os
+import math
 import numpy as np
 import pandas as pd
 import sklearn
@@ -18,7 +19,7 @@ from pipeline import (
     train_models, evaluate_models, prepare_data_for_model,
     model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2,
     train_one_model,
-    eval_metrics
+    eval_metrics, get_save_id
 )
 
 year_str = 'Year'
@@ -45,6 +46,7 @@ model_type_str = 'model type'
 week_column_names = ['DayOfWeek_' + day for day in
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
 figure_path = 'figures/'
+predicitons_path = 'preds/'
 
 # === Configurable Parameters ===
 dataset_path = './Datasets/'
@@ -69,8 +71,17 @@ predefined_validation_scenarios = {
     "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
 }
 
+def create_dir(path):
+    """
+    Creates a directory if it doesn't exist yet.
+
+    :param path: The path to the directory
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+
 def remove_covid_data(df):
-    df = df[~((df[year_str]==2020) & (df[month_str]>2))]
+    df = df[~(df[year_str]>=2020)]
     return df
 
 def split_data_by_month_percentage(df, percentages):
@@ -381,9 +392,24 @@ def manual_tuning(model_type):
     print('Done')
 
 
+def upsampling(df):
+    max_user_data = df[user_str].value_counts().max()
+    for user in df[user_str].unique():
+        user_data = df[df[user_str]==user]
+        user_count = user_data.shape[0]
+        times = max_user_data / user_count
+        before_comma = math.floor(times)
+        after_comma = times % 1
+        after_comma_data = user_data.sample(frac=after_comma)
+        for i in range(1, before_comma):
+            df = pd.concat([df, user_data], ignore_index=True)
+        df = pd.concat([df, after_comma_data], ignore_index=True)
+    return df
+
+
 def manual_tuning_v3(model_type):
     # TODO: hrs/min + different sequence lengths
-    sequence_length = 20
+    sequence_length = 7
 
     tr, val, te = get_prepared_data_v3(dataset_hrs_path)
 
@@ -391,28 +417,37 @@ def manual_tuning_v3(model_type):
     # config
     repeats = 3
     n_batch = 1024
-    n_epochs = 500
-    n_neurons = 16
-    l_rate = 1e-4
+    n_epochs = 200
+    n_neurons = 256
+    n_neurons2 = 512
+    n_neurons3 = 512
+    n_neurons4 = 128
+    l_rate = 1e-2
+    d1 = 256
+    reg1 = L1L2(l1=0.0, l2=0.001)
+    r1 = '0001'
+    reg2 = L1L2(l1=0.0, l2=0.1)
+    r2 = '01'
 
     history_list = list()
     # run diagnostic tests
    for i in range(repeats):
         history = train_one_model(tr, val, n_batch, n_epochs,
-                                  n_neurons, l_rate,
+                                  n_neurons, n_neurons2, n_neurons3, n_neurons4, l_rate, d1, r1, reg1, r2, reg2,
                                   sequence_length=sequence_length, model_type=model_type)
         history_list.append(history)
 
-    for metric in ['p', 'r', 'f1']:
+    for metric in ['acc', 'p', 'r', 'f1']:
         for history in history_list:
             plt.plot(history['train_'+metric], color='blue')
             plt.plot(history['test_'+metric], color='orange')
-        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
-                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
+        plt.savefig(figure_path+'v3/'+metric+get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2)
+                    +'.png')
         plt.clf()
     print('Done')
 
+
 def calculate_baselines():
     file_combinations = [(hour_timespan_str, with_threshold_str,'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
                          (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
@@ -448,23 +483,51 @@ def get_prepared_data_v3(filename, sample=100):
     df = pd.read_json(filename)
     df = remove_covid_data(df)
 
-    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
+    # remove users with too little data
+    value_counts = df[user_str].value_counts()
+    df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
+
+    adjusted_df = pd.DataFrame()
+    # adjust labels so that user ids are consecutive integers starting at 0
+    new_id = 0
+    for user_id in df[user_str].unique():
+        user_data = df[df[user_str]==user_id].copy()
+        user_data[user_str] = new_id
+        adjusted_df = pd.concat([adjusted_df, user_data], ignore_index=True)
+        new_id += 1
+
+    # bin steps per hour TODO: adjust for minutes
+    for hour in ['Hour_'+str(i) for i in range(24)]:
+        hour_data = adjusted_df[hour]
+        # smaller than 1000 - round to 10
+        a = ((hour_data[hour_data<1000]/10).round()*10)
+        # between 1000 and 10000 - round to next 100
+        b = ((hour_data[(hour_data>=1000) & (hour_data<10000)]/100).round()*100)
+        # higher or equal 10000 - one class
+        c = hour_data[hour_data >= 10000]
+        c = pd.Series(data={ind:10000 for ind in c.index}, index=c.index)
+        new = pd.concat([a, b, c]).sort_index().astype(int)
+        adjusted_df[hour] = new
+
+    tr, val, te = split_data_by_userdata_percentage(adjusted_df, percentages=(70, 15, 15), sample=sample)
 
     tr = reduce_columns_v3(tr)
     val = reduce_columns_v3(val)
     te = reduce_columns_v3(te)
+
     scaler = MinMaxScaler()
     scaler.fit(tr.drop(columns=[user_str]))
 
     return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)
 
+
 def scale_dataset(scaler, df):
     y = df[user_str]
     x_scaled = scaler.transform(df.drop(columns=[user_str]))
     df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
     df_scaled.columns = df.columns
-    return prepare_user_data(df)
+    return prepare_user_data(df_scaled)
 
 
 def calculate_baselines_v3():
@@ -498,5 +561,6 @@ if __name__ == "__main__":
     #visualise_results_v2()
     #manual_tuning(model_type=model_type_lstm)
     #calculate_baselines()
-    calculate_baselines_v3()
-    print('Done')
+    #calculate_baselines_v3()
+    manual_tuning_v3(model_type=model_type_lstm)
+    print('Done') # TODO: differently sized data amounts as a problem (also in the evaluation)
diff --git a/pipeline.py b/pipeline.py
index a8affae..bcd136a 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -14,13 +14,14 @@ from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,GRU
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
 
 epochs = 5#50
 model_type_gru = 'GRU'
 model_type_lstm = 'LSTM'
 model_type_bilstm = 'BiLSTM'
 
+
 # === Display functions ===
 def display_warning_about_2020_data():
     print("\n⚠️ Warning: 2020 data after February is excluded due to COVID-19.")
@@ -67,11 +68,11 @@ def prepare_user_data(df):
 def make_sequences(data, sequence_length):
     x, y = [], []
     features = data.drop('user', axis=1).values
-    features = features.astype(int)
+    #features = features.astype(int)
     labels = data['user'].values
-    for i in range(len(features) - sequence_length):
+    for i in range(len(features) - sequence_length+1):
         x.append(features[i:i + sequence_length])
-        y.append(labels[i + sequence_length])
+        y.append(labels[i + sequence_length-1])
     return x, y
 
 def prepare_data_for_model(user_data, sequence_length):
@@ -209,7 +210,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     return tuner.get_best_models(num_models=1)[0]
 
 
-def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
+def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, l_rate, d1, r1, reg1, r2, reg2, sequence_length, model_type):
     x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
     n_features = x.shape[2]
     users = list(train_data.keys())
@@ -218,15 +219,19 @@ def build_model():
         model = Sequential()
         model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
-#        if model_type == model_type_bilstm:
-            # model.add(Bidirectional(units=units_hp))
+        if model_type == model_type_bilstm:
+            model.add(Bidirectional(LSTM(n_neurons)))
         if model_type == model_type_lstm:
+#            model.add(LSTM(n_neurons, kernel_regularizer=reg1, return_sequences=True))
             model.add(LSTM(n_neurons))
-        # if model_type == model_type_gru:
-            # model.add(GRU(units=units_hp))
+            # model.add(LSTM(n_neurons2))
+            # model.add(LSTM(n_neurons3, return_sequences=True))
+            # model.add(LSTM(n_neurons4))
+        if model_type == model_type_gru:
+            model.add(GRU(n_neurons))
         # TODO: add another dense layer
-        #model.add(Dense(256, activation='relu'))
-        # model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
+        #model.add(Dense(n_neurons, activation='relu'))
+        #model.add(Dropout(d1))
         model.add(Dense(len(users), activation='softmax'))
         model.compile(
             optimizer=Adam(learning_rate=l_rate),
@@ -248,7 +253,8 @@
         train_r.append(r)
         train_f1.append(f1)
         # evaluate model on test data
-        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
+        savename = 'cf_matrix_'+get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2)+'.json'
+        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch, save_name=savename)
         test_acc.append(acc)
         test_p.append(p)
         test_r.append(r)
         test_f1.append(f1)
@@ -262,13 +268,27 @@
     return history
 
 
-def evaluate(model, df, sequence_length, batch_size):
+def get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2):
+    return '_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+str(n_batch)
+    #'x'+str(n_neurons3)+'x'+str(n_neurons4)
+    #+'_l'+str(l_rate)+'_r'+str(r1)+'xx'+str(r2)
+
+
+def evaluate(model, df, sequence_length, batch_size, save_name=None):
     x, y = prepare_data_for_model(user_data=df, sequence_length=sequence_length)
     x = np.array(x)
     y_true = np.array(y)
 
     y_pred = model.predict(x, verbose=0, batch_size=batch_size)
     y_pred_classes = np.argmax(y_pred, axis=1)
 
+    cf_matrix = pd.DataFrame(confusion_matrix(y_true, y_pred_classes))
+    if save_name is not None:
+        cf_matrix.to_json('results/'+save_name)
+    true_counts = pd.DataFrame(y).value_counts()
+    print('Top true occurrences', true_counts[:6])
+    predicted_counts = pd.DataFrame(y_pred_classes).value_counts()
+    print('Top predicted occurrences', predicted_counts[:6])
+
     return eval_metrics(y_true=y_true, y_pred=y_pred_classes)
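
Note on inspecting the saved confusion matrices (illustrative only, not part of the patch): evaluate() now writes the confusion matrix as JSON under results/ whenever save_name is passed, using the name 'cf_matrix_' + get_save_id(...) + '.json', and it assumes the results/ directory already exists (it could be created with the create_dir helper added in main.py). A minimal sketch of reloading such a file and computing per-user recall, assuming the settings from manual_tuning_v3() and the default to_json/read_json orientation:

    import pandas as pd
    from pipeline import get_save_id

    # Settings copied from manual_tuning_v3(); adjust them to the run being inspected.
    save_id = get_save_id(200, 256, 512, 512, 128, 1024, 1e-2, 256, '0001', '01')
    cf = pd.read_json('results/cf_matrix_' + save_id + '.json')

    # sklearn's confusion_matrix layout: rows are true users, columns are predicted users.
    cf = cf.sort_index().sort_index(axis=1)
    per_user_recall = pd.Series(cf.values.diagonal(), index=cf.index) / cf.sum(axis=1)
    print(per_user_recall.sort_values())

Since get_prepared_data_v3() relabels users as consecutive integers starting at 0, the row and column indices correspond to those adjusted user ids.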