You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
169 lines
7.1 KiB
169 lines
7.1 KiB
import json
|
|
import os
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import sklearn
|
|
|
|
from pipeline import (
|
|
load_dataset,
|
|
filter_data,
|
|
filter_test_data,
|
|
prepare_user_data,
|
|
train_models,
|
|
evaluate_models,
|
|
display_warning_about_2020_data,
|
|
display_warnings_for_scenarios, prepare_data_for_model
|
|
)
|
|
|
|
# Columns of the input dataset that the splitting/filtering helpers index by.
year_str = 'Year'
month_str = 'Month'
user_str = 'user'

# Columns of the results table built by evaluate_model_on_test_data.
split_str = 'split type'
threshold_str = 'threshold used'
timespan_str = 'time used'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'

# One 'DayOfWeek_<day>' column name per weekday; reduce_columns drops these
# columns for the 15-minute datasets.
weak_column_names = [
    f'DayOfWeek_{day}'
    for day in (
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    )
]
|
|
|
|
# === Configurable Parameters ===
# Directory scanned by main_two() and the single dataset used by main().
dataset_path = './Datasets/'
DATA_PATH = f'{dataset_path}ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'

# Output locations: Excel report for main(), JSON results table for main_two().
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename = './working/evaluation_results.json'

SEQUENCE_LENGTHS = [20, 15, 10, 5, 1]  # You can add more: [20, 25, 30]

# Every scenario is a list of (year, [months]) pairs.
TRAINING_SCENARIO = [
    (2018, list(range(1, 13))),
    (2019, list(range(1, 10))),
]
VALIDATION_SCENARIO = [
    (2019, [10, 11, 12]),
]
TEST_SCENARIO = [
    (2020, [1, 2]),  # Jan–Feb 2020 only
]

# === Optional display only ===
predefined_training_scenarios = {
    "Scenario 1": {
        "years_months": [
            (2018, list(range(1, 13))),
            (2019, list(range(1, 10))),
        ],
    },
    "Scenario 2": {
        "years_months": [
            (2017, list(range(1, 13))),
            (2018, list(range(1, 13))),
            (2019, list(range(1, 10))),
        ],
    },
}
predefined_validation_scenarios = {
    "Scenario A": {
        "years_months": [
            (2019, [10, 11, 12]),
        ],
    },
}
|
|
|
|
def remove_covid_data(df):
    """Return *df* without the rows dated after February 2020.

    A row is removed when its ``year_str`` value equals 2020 and its
    ``month_str`` value is greater than 2; every other row is kept. The
    input frame itself is not modified.
    """
    # Build the "row to discard" mask first, then keep its complement.
    after_february_2020 = (df[year_str] == 2020) & (df[month_str] > 2)
    return df[~after_february_2020]
|
|
|
|
def split_data_by_month_percentage(df, percentages):
    """Split *df* into train/validation/test frames along whole months.

    The distinct (year, month) pairs found in *df* are sorted in ascending
    order and cut into three consecutive groups. Each row then follows its
    month, so a given month never appears in more than one split.

    Args:
        df: DataFrame containing the ``year_str`` and ``month_str`` columns.
        percentages: ``(train, validation, test)`` percentages. Only the
            first two position the cuts; the test split receives whatever
            months remain after them.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames, each produced by
        an inner merge of *df* with the months assigned to that split.
    """
    train_p, valid_p, _ = percentages
    keys = [year_str, month_str]
    months = df[keys].drop_duplicates().sort_values(keys)

    # Slice positionally instead of calling np.split on a DataFrame: np.split
    # goes through DataFrame.swapaxes, which pandas has deprecated. The cut
    # points are the same ones np.split was given.
    train_end = int((train_p / 100) * len(months))
    valid_end = int(((train_p + valid_p) / 100) * len(months))
    month_groups = (
        months.iloc[:train_end],
        months.iloc[train_end:valid_end],
        months.iloc[valid_end:],
    )
    return tuple(df.merge(group, on=keys, how='inner') for group in month_groups)
|
|
|
|
def split_data_by_userdata_percentage(df, percentages):
    """Split every user's rows into train/validation/test by row percentage.

    For each distinct value in the ``user_str`` column, that user's rows are
    sorted by (year, month) and cut into three consecutive slices. The slices
    of all users are then stacked into one frame per split.

    Args:
        df: DataFrame containing the ``user_str``, ``year_str`` and
            ``month_str`` columns.
        percentages: ``(train, validation, test)`` percentages. Only the
            first two position the cuts; the test split receives the
            remaining rows of each user.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames with a fresh
        0..n-1 index. All three are empty DataFrames when *df* has no users.
    """
    train_p, valid_p, _ = percentages
    train_parts, valid_parts, test_parts = [], [], []

    for user_id in df[user_str].unique():
        user_rows = df[df[user_str] == user_id].sort_values([year_str, month_str])
        # Positional slices replace np.split on a DataFrame, which relies on
        # the deprecated DataFrame.swapaxes; the cut points are unchanged.
        train_end = int((train_p / 100) * len(user_rows))
        valid_end = int(((train_p + valid_p) / 100) * len(user_rows))
        train_parts.append(user_rows.iloc[:train_end])
        valid_parts.append(user_rows.iloc[train_end:valid_end])
        test_parts.append(user_rows.iloc[valid_end:])

    def stack(parts):
        # pd.concat rejects an empty list, so fall back to an empty frame.
        if not parts:
            return pd.DataFrame()
        return pd.concat(parts, ignore_index=True)

    # Concatenate once per split rather than once per user: growing a frame
    # inside the loop copies all previously collected rows on every iteration.
    return stack(train_parts), stack(valid_parts), stack(test_parts)
|
|
|
|
|
|
def main():
    """Run the fixed-scenario pipeline on the dataset at ``DATA_PATH``.

    Rows are selected with TRAINING_SCENARIO and VALIDATION_SCENARIO, models
    are trained for every entry of SEQUENCE_LENGTHS, and the resulting models
    are evaluated on the TEST_SCENARIO rows. ``evaluate_models`` is handed
    OUTPUT_EXCEL_PATH as its output location.
    """
    # NOTE: the scenario warning output (display_warning_about_2020_data and
    # display_warnings_for_scenarios for the "training" and "validation"
    # setups) is currently disabled.

    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)

    # Flag forwarded to filter_data / evaluate_models: True only when the
    # configured dataset is the 15-minute, no-threshold file.
    uses_15min_without_threshold = 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH

    training_data = filter_data(df, TRAINING_SCENARIO, uses_15min_without_threshold)
    validation_data = filter_data(df, VALIDATION_SCENARIO, uses_15min_without_threshold)

    user_data_train = prepare_user_data(training_data)
    user_data_val = prepare_user_data(validation_data)

    # === Train models ===
    best_models = train_models(
        user_data_train,
        user_data_val,
        sequence_lengths=SEQUENCE_LENGTHS,
    )

    # === Load and evaluate test ===
    test_df = filter_test_data(df, TEST_SCENARIO)
    evaluate_models(
        best_models,
        test_df,
        SEQUENCE_LENGTHS,
        OUTPUT_EXCEL_PATH,
        uses_15min_without_threshold,
    )

    print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")
|
|
|
|
|
|
def reduce_columns(df, filename):
    """Return a copy of *df* without the date bookkeeping columns.

    'Month', 'Year' and 'date' are always dropped. When *filename* contains
    '15MIN', the day-of-week columns listed in ``weak_column_names`` are
    dropped as well. ``DataFrame.drop`` raises ``KeyError`` if any of the
    columns to drop is missing.
    """
    unwanted = ['Month', 'Year', 'date']
    if '15MIN' in filename:
        # Build a new list so the module-level weak_column_names is not mutated.
        unwanted = unwanted + weak_column_names
    return df.drop(columns=unwanted)
|
|
|
|
def main_two():
    """Train and evaluate one model per sequence length, dataset file and split method.

    Every file in ``dataset_path`` is combined with every entry of
    SEQUENCE_LENGTHS and with both split strategies (per-user row percentages
    and whole-month percentages, each 80/10/10). The file name decides the
    recorded time span ('15MIN' or '1HR') and threshold setting ('WITHOUT' or
    'WITH').

    Results are accumulated in a DataFrame that is loaded from
    ``result_filename`` when that file exists. A combination whose split type,
    time span, threshold and sequence length already appear in the results is
    skipped, so an interrupted run can be resumed.
    """
    results = pd.DataFrame()
    if os.path.exists(result_filename):
        # Context manager so the file handle is closed right after reading.
        with open(result_filename) as results_file:
            results = pd.DataFrame(json.load(results_file))

    # Make sure the checkpoint can be written before any training is done.
    output_dir = os.path.dirname(result_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    split_methods = [
        ('data percentages', split_data_by_userdata_percentage),
        ('month percentages', split_data_by_month_percentage),
    ]

    for sequence_length in SEQUENCE_LENGTHS:
        for data_filename in os.listdir(dataset_path):
            # These labels depend on the file name only, not on the split method.
            timespan_id = '15MIN' if '15MIN' in data_filename else '1HR'
            threshold_id = 'WITHOUT' if 'WITHOUT' in data_filename else 'WITH'
            file_path = os.path.join(dataset_path, data_filename)

            # Loaded on first use and shared by both split methods, so a file
            # whose combinations are all done is never read at all.
            df = None

            for split_id, split_method in split_methods:
                if len(results) > 0:
                    already_done = results[
                        (results[split_str] == split_id)
                        & (results[timespan_str] == timespan_id)
                        & (results[threshold_str] == threshold_id)
                        & (results[sequence_length_str] == sequence_length)
                    ]
                    if len(already_done) > 0:
                        continue

                if df is None:
                    df = remove_covid_data(load_dataset(file_path))

                tr, val, te = split_method(df, percentages=(80, 10, 10))
                tr = reduce_columns(tr, data_filename)
                val = reduce_columns(val, data_filename)
                te = reduce_columns(te, data_filename)

                user_data_train = prepare_user_data(tr)
                user_data_val = prepare_user_data(val)

                best_models = train_models(
                    user_data_train,
                    user_data_val,
                    sequence_lengths=[sequence_length],
                )

                new_row = evaluate_model_on_test_data(
                    model=best_models[sequence_length]['model'],
                    test_df=te,
                    split_id=split_id,
                    sequence_length=sequence_length,
                    time_span_id=timespan_id,
                    threshold_id=threshold_id,
                )
                results = pd.concat([results, new_row], ignore_index=True)

                # Checkpoint after every evaluation so finished work survives
                # an interruption and is skipped on the next run.
                results.to_json(result_filename)
|
|
|
|
|
|
# === Evaluation ===
|
|
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id, threshold_id, time_span_id):
    """Score *model* on *test_df* and return the metrics as a one-row DataFrame.

    The test frame is converted with ``prepare_user_data`` and
    ``prepare_data_for_model``; the predicted class of each sample is the
    argmax of ``model.predict`` along axis 1. Recall, precision and F1 are
    computed with ``average='weighted'``.

    Args:
        model: Object exposing ``predict(x, verbose=0)``.
        test_df: Test rows, in the format ``prepare_user_data`` accepts.
        sequence_length: Sequence length passed to ``prepare_data_for_model``
            and recorded in the result row.
        split_id: Label of the split strategy, recorded in the result row.
        threshold_id: Label of the threshold setting, recorded in the result row.
        time_span_id: Label of the time span, recorded in the result row.

    Returns:
        A single-row DataFrame with the columns ``split_str``,
        ``threshold_str``, ``timespan_str``, ``sequence_length_str``,
        ``recall_str``, ``precision_str`` and ``f1_string``.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.metrics` is available as an attribute.
    from sklearn import metrics as sk_metrics

    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)

    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)

    recall = sk_metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sk_metrics.precision_score(y, y_pred_classes, average='weighted')
    f1 = sk_metrics.f1_score(y, y_pred_classes, average='weighted')

    return pd.DataFrame({
        split_str: [split_id],
        threshold_str: [threshold_id],
        timespan_str: [time_span_id],
        sequence_length_str: [sequence_length],
        recall_str: [recall],
        precision_str: [precision],
        f1_string: [f1],
    })
|
|
|
|
|
|
# Script entry point: run the multi-dataset evaluation, then report completion.
if __name__ == "__main__":
    main_two()
    print('Done')
|