
Added new evaluation method

master
Bianca Steffes 3 weeks ago
commit 316a7f0343
  1. .gitignore (1 line changed)
  2. Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx (BIN)
  3. main.py (89 lines changed)
  4. pipeline.py (33 lines changed)

.gitignore (1 line changed)

@@ -139,3 +139,4 @@ dmypy.json
cython_debug/
.idea
working/tuner

Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx (BIN)

main.py (89 lines changed)

@@ -1,5 +1,9 @@
import json
import os
import numpy as np
import pandas as pd
import sklearn.metrics
from pipeline import (
    load_dataset,
@@ -9,17 +13,28 @@ from pipeline import (
    train_models,
    evaluate_models,
    display_warning_about_2020_data,
    display_warnings_for_scenarios
    display_warnings_for_scenarios, prepare_data_for_model
)
year_str = 'Year'
month_str = 'Month'
user_str = 'user'
split_str = 'split type'
threshold_str = 'threshold used'
timespan_str = 'time used'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
weak_column_names = ['DayOfWeek_'+day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
# === Configurable Parameters ===
DATA_PATH = './Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
dataset_path = './Datasets/'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
SEQUENCE_LENGTHS = [20] # You can add more: [20, 25, 30]
result_filename = './working/evaluation_results.json'
SEQUENCE_LENGTHS = [20, 15, 10, 5, 1] # You can add more: [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
@@ -67,9 +82,6 @@ def main():
    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)
    removed = remove_covid_data(df)
    tr,val,te = split_data_by_userdata_percentage(df, (80,10,10))
    tr_2, val_2, te_2 = split_data_by_month_percentage(df, (80, 10, 10))
    ALLUSERS32_15MIN_WITHOUTTHREHOLD = False
    if('ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH):
@@ -90,5 +102,68 @@ def main():
print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")
def reduce_columns(df, filename):
if '15MIN' in filename:
return df.drop(columns=['Month', 'Year', 'date']+weak_column_names)
else:
return df.drop(columns=['Month', 'Year', 'date'])
def main_two():
results = pd.DataFrame()
if os.path.exists(result_filename):
results = pd.DataFrame(json.load(open(result_filename)))
for sequence_length in SEQUENCE_LENGTHS:
for data_filename in os.listdir(dataset_path):
for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]:
timespan_id = '1HR'
threshold_id = 'WITH'
if '15MIN' in data_filename:
timespan_id = '15MIN'
if 'WITHOUT' in data_filename:
threshold_id = 'WITHOUT'
if len(results) > 0:
if len(results[(results[split_str]==split_id) &
(results[timespan_str]==timespan_id) &
(results[threshold_str]==threshold_id) &
(results[sequence_length_str]==sequence_length)]) > 0:
continue
file_path = os.path.join(dataset_path, data_filename)
df = load_dataset(file_path)
df = remove_covid_data(df)
tr,val,te = split_method(df, percentages=(80,10,10))
tr = reduce_columns(tr, data_filename)
val = reduce_columns(val, data_filename)
te = reduce_columns(te, data_filename)
user_data_train = prepare_user_data(tr)
user_data_val = prepare_user_data(val)
best_models = train_models(user_data_train, user_data_val, sequence_lengths=[sequence_length])
results = pd.concat([results,
evaluate_model_on_test_data(model=best_models[sequence_length]['model'], test_df=te, split_id=split_id,
sequence_length=sequence_length, time_span_id=timespan_id, threshold_id=threshold_id)], ignore_index=True)
results.to_json(result_filename)
# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id, threshold_id, time_span_id):
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    recall = sklearn.metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    return pd.DataFrame({split_str:[split_id], threshold_str:[threshold_id], timespan_str:[time_span_id],
                         sequence_length_str:[sequence_length], recall_str:[recall],
                         precision_str:[precision], f1_string:[f1_score]})
if __name__ == "__main__":
    main()
    main_two()
    print('Done')
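
The new main_two() loop keeps its cumulative results in ./working/evaluation_results.json and, on start-up, skips any dataset/split/sequence-length combination already recorded there. A rough sketch of how those cached results could be reloaded and summarised afterwards; the pivot itself is illustrative and not part of the commit, and the column names are simply the *_str constants defined above:

import json
import pandas as pd

# Reload the cache the same way main_two() does on start-up.
results = pd.DataFrame(json.load(open('./working/evaluation_results.json')))

# One row per evaluated combination; pivot F1 by split strategy and sequence length.
summary = results.pivot_table(index=['split type', 'sequence length'],
                              columns=['time used', 'threshold used'],
                              values='f1 score')
print(summary)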

pipeline.py (33 lines changed)

@@ -54,6 +54,18 @@ def prepare_user_data(df):
    users = df_sorted['user'].unique()
    return {user: df_sorted[df_sorted['user'] == user] for user in users}
def prepare_data_for_model(user_data, sequence_length):
    X, y = [], []
    for user, data in user_data.items():
        features = data.drop('user', axis=1).values
        labels = data['user'].values
        for i in range(len(features) - sequence_length):
            X.append(features[i:i + sequence_length])
            y.append(labels[i + sequence_length])
    X = np.array(X)
    y = np.array(y)
    return X, y
# === Training & Validation ===
def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./working/tuner"):
    best_models = {}
@@ -65,25 +77,8 @@ def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./working/tuner"):
    for sequence_length in sequence_lengths:
        print(f"\n=== Training for Sequence Length: {sequence_length} ===")
        X, y = [], []
        for user, data in user_data.items():
            features = data.drop('user', axis=1).values
            labels = data['user'].values
            for i in range(len(features) - sequence_length):
                X.append(features[i:i + sequence_length])
                y.append(labels[i + sequence_length])
        X = np.array(X)
        y = np.array(y)
        X_val, y_val = [], []
        for user, data in user_data_val.items():
            features = data.drop('user', axis=1).values
            labels = data['user'].values
            for i in range(len(features) - sequence_length):
                X_val.append(features[i:i + sequence_length])
                y_val.append(labels[i + sequence_length])
        X_val = np.array(X_val)
        y_val = np.array(y_val)
        X, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
        X_val, y_val = prepare_data_for_model(user_data=user_data_val, sequence_length=sequence_length)
        if X.shape[0] == 0 or X_val.shape[0] == 0:
            print(f"⚠️ Skipped sequence length {sequence_length} due to insufficient data.")
