diff --git a/.gitignore b/.gitignore index aa68a72..edf97dc 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,4 @@ dmypy.json cython_debug/ .idea +working/tuner diff --git a/Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD .xlsx b/Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD .xlsx deleted file mode 100644 index 8e4d9ed..0000000 Binary files a/Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD .xlsx and /dev/null differ diff --git a/main.py b/main.py index 78c90e3..6e3e8e1 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,9 @@ +import json +import os + import numpy as np import pandas as pd +import sklearn from pipeline import ( load_dataset, @@ -9,17 +13,28 @@ from pipeline import ( train_models, evaluate_models, display_warning_about_2020_data, - display_warnings_for_scenarios + display_warnings_for_scenarios, prepare_data_for_model ) year_str = 'Year' month_str = 'Month' user_str = 'user' +split_str = 'split type' +threshold_str = 'threshold used' +timespan_str = 'time used' +sequence_length_str = 'sequence length' +precision_str = 'precision' +recall_str = 'recall' +f1_string = 'f1 score' +weak_column_names = ['DayOfWeek_'+day for day in + ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]] # === Configurable Parameters === -DATA_PATH = './Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' +dataset_path = './Datasets/' +DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx' -SEQUENCE_LENGTHS = [20] # You can add more: [20, 25, 30] +result_filename = './working/evaluation_results.json' +SEQUENCE_LENGTHS = [20, 15, 10, 5, 1] # You can add more: [20, 25, 30] TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))] VALIDATION_SCENARIO = [(2019, [10, 11, 12])] @@ -67,9 +82,6 @@ def main(): # === Load and preprocess === df = load_dataset(DATA_PATH) - removed = remove_covid_data(df) - tr,val,te = split_data_by_userdata_percentage(df, (80,10,10)) - tr_2, val_2, te_2 = 
def reduce_columns(df, filename):
    """Drop the calendar columns that are not fed to the model.

    ``Month``, ``Year`` and ``date`` are always removed. When ``'15MIN'``
    occurs in *filename*, the ``DayOfWeek_*`` columns listed in the
    module-level ``weak_column_names`` are removed as well.

    Args:
        df: DataFrame holding one data split.
        filename: Name of the dataset file the split was loaded from.

    Returns:
        A new DataFrame without the dropped columns.

    Raises:
        KeyError: If one of the columns to drop is missing from *df*.
    """
    columns_to_drop = ['Month', 'Year', 'date']
    if '15MIN' in filename:
        columns_to_drop = columns_to_drop + weak_column_names
    return df.drop(columns=columns_to_drop)


def main_two():
    """Train and evaluate one model per (sequence length, dataset, split).

    Every file in ``dataset_path`` is combined with every entry of
    ``SEQUENCE_LENGTHS`` and both split strategies. Results are appended to
    ``result_filename`` after each evaluation, and combinations already
    present in that file are skipped, so an interrupted run can be resumed.

    NOTE(review): a stored result is identified only by split type, time
    span, threshold flag and sequence length. Two dataset files that map to
    the same flags would be treated as the same experiment.
    """
    results = pd.DataFrame()
    if os.path.exists(result_filename):
        # Context manager so the handle is closed deterministically.
        with open(result_filename, encoding='utf-8') as results_file:
            results = pd.DataFrame(json.load(results_file))

    # Make sure the checkpoint file can be written on the very first run.
    result_dir = os.path.dirname(result_filename)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    split_methods = [
        ('data percentages', split_data_by_userdata_percentage),
        ('month percentages', split_data_by_month_percentage),
    ]

    # os.listdir() returns entries in arbitrary order and includes
    # sub-directories; keep regular files only and fix the order so runs are
    # reproducible.
    data_filenames = sorted(
        name for name in os.listdir(dataset_path)
        if os.path.isfile(os.path.join(dataset_path, name))
    )

    for sequence_length in SEQUENCE_LENGTHS:
        for data_filename in data_filenames:
            # These identifiers depend on the file name only, not on the split.
            timespan_id = '15MIN' if '15MIN' in data_filename else '1HR'
            threshold_id = 'WITHOUT' if 'WITHOUT' in data_filename else 'WITH'
            file_path = os.path.join(dataset_path, data_filename)

            for split_id, split_method in split_methods:
                if len(results) > 0:
                    already_done = (
                        (results[split_str] == split_id)
                        & (results[timespan_str] == timespan_id)
                        & (results[threshold_str] == threshold_id)
                        & (results[sequence_length_str] == sequence_length)
                    )
                    if already_done.any():
                        continue

                df = load_dataset(file_path)
                df = remove_covid_data(df)
                tr, val, te = split_method(df, percentages=(80, 10, 10))
                tr = reduce_columns(tr, data_filename)
                val = reduce_columns(val, data_filename)
                te = reduce_columns(te, data_filename)

                user_data_train = prepare_user_data(tr)
                user_data_val = prepare_user_data(val)

                best_models = train_models(
                    user_data_train,
                    user_data_val,
                    sequence_lengths=[sequence_length],
                )

                # train_models reports skipping a sequence length when there
                # is not enough data; presumably no entry is stored then, so
                # avoid a KeyError here.
                best = best_models.get(sequence_length)
                if best is None:
                    print(
                        f"No model trained for sequence length "
                        f"{sequence_length} on {data_filename} ({split_id}); "
                        f"skipping evaluation."
                    )
                    continue

                evaluation = evaluate_model_on_test_data(
                    model=best['model'],
                    test_df=te,
                    split_id=split_id,
                    sequence_length=sequence_length,
                    time_span_id=timespan_id,
                    threshold_id=threshold_id,
                )
                results = pd.concat([results, evaluation], ignore_index=True)
                # Checkpoint after every evaluation so a crash loses at most
                # one training run.
                results.to_json(result_filename)
# --- pipeline.py -----------------------------------------------------------
def prepare_data_for_model(user_data, sequence_length):
    """Turn per-user frames into sliding-window samples for the model.

    For every frame, each run of *sequence_length* consecutive rows (all
    columns except ``'user'``) becomes one sample; its label is the
    ``'user'`` value of the row immediately after the window. Frames with
    at most *sequence_length* rows contribute no samples.

    Args:
        user_data: Mapping whose values are DataFrames containing a
            ``'user'`` column plus the feature columns.
        sequence_length: Number of consecutive rows per sample (>= 1).

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the windows and ``y``
        holds one label per window. Both have length 0 along the first axis
        when no frame is long enough.

    Raises:
        ValueError: If *sequence_length* is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be >= 1, got {sequence_length}"
        )

    windows, targets = [], []
    for frame in user_data.values():
        features = frame.drop('user', axis=1).values
        labels = frame['user'].values
        # A negative count yields an empty range, i.e. no samples.
        starts = range(len(features) - sequence_length)
        windows.extend(features[s:s + sequence_length] for s in starts)
        targets.extend(labels[s + sequence_length] for s in starts)
    return np.array(windows), np.array(targets)


# --- main.py ---------------------------------------------------------------
# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id, threshold_id, time_span_id):
    """Score *model* on the test split and return the metrics as one row.

    Args:
        model: Trained model exposing ``predict(x, verbose=0)`` that returns
            one score per class for every sample.
        test_df: Test split; passed to ``prepare_user_data``.
        sequence_length: Window length the model was trained with.
        split_id: Label of the split strategy, stored in the result row.
        threshold_id: Label of the threshold variant, stored in the result row.
        time_span_id: Label of the time resolution, stored in the result row.

    Returns:
        Single-row DataFrame with the identifying labels plus weighted
        recall, precision and F1 score.

    Raises:
        ValueError: If the test split yields no samples for
            *sequence_length*.
    """
    # A bare ``import sklearn`` does not load the ``sklearn.metrics``
    # submodule, so ``sklearn.metrics.<fn>`` can fail with AttributeError.
    # Import the submodule explicitly.
    from sklearn import metrics

    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    if len(x) == 0:
        raise ValueError(
            f"Test split contains no samples for sequence length {sequence_length}"
        )

    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)

    recall = metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = metrics.f1_score(y, y_pred_classes, average='weighted')
    return pd.DataFrame({
        split_str: [split_id],
        threshold_str: [threshold_id],
        timespan_str: [time_span_id],
        sequence_length_str: [sequence_length],
        recall_str: [recall],
        precision_str: [precision],
        f1_string: [f1_score],
    })


if __name__ == "__main__":
    main_two()
    print('Done')
range(len(features) - sequence_length): - X.append(features[i:i + sequence_length]) - y.append(labels[i + sequence_length]) - X = np.array(X) - y = np.array(y) - - X_val, y_val = [], [] - for user, data in user_data_val.items(): - features = data.drop('user', axis=1).values - labels = data['user'].values - for i in range(len(features) - sequence_length): - X_val.append(features[i:i + sequence_length]) - y_val.append(labels[i + sequence_length]) - X_val = np.array(X_val) - y_val = np.array(y_val) + X, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length) + X_val, y_val = prepare_data_for_model(user_data=user_data_val, sequence_length=sequence_length) if X.shape[0] == 0 or X_val.shape[0] == 0: print(f"⚠️ Skipped sequence length {sequence_length} due to insufficient data.")