import json
import os
import numpy as np
import pandas as pd
import sklearn.metrics
from matplotlib import pyplot as plt
from pipeline import (
    load_dataset,
    filter_data,
    filter_test_data,
    prepare_user_data,
    train_models,
    evaluate_models,
    prepare_data_for_model,
    model_type_gru,
    model_type_lstm,
    model_type_bilstm,
    train_models_v2,
)
year_str = 'Year'
month_str = 'Month'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'
weak_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
figure_path = 'figures/'
# === Configurable Parameters ===
dataset_path = './Datasets/'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]  # Adjust this list to try other sequence lengths
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
TEST_SCENARIO = [(2020, [1, 2])] # Jan–Feb 2020 only
# === Optional display only ===
predefined_training_scenarios = {
    "Scenario 1": {"years_months": [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]},
    "Scenario 2": {"years_months": [(2017, list(range(1, 13))), (2018, list(range(1, 13))), (2019, list(range(1, 10)))]}
}
predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
}
def remove_covid_data(df):
    """Drop all rows after February 2020 so the COVID period does not skew the models."""
    df = df[~((df[year_str] == 2020) & (df[month_str] > 2))]
    return df
def split_data_by_month_percentage(df, percentages):
    """Split chronologically by unique (year, month) pairs according to (train, valid, test) percentages."""
    train_p, valid_p, test_p = percentages
    ids = df[[year_str, month_str]].drop_duplicates().sort_values([year_str, month_str])
    tr, va, te = np.split(ids, [int((train_p / 100) * len(ids)), int(((train_p + valid_p) / 100) * len(ids))])
    return (df.merge(tr, on=[year_str, month_str], how='inner'),
            df.merge(va, on=[year_str, month_str], how='inner'),
            df.merge(te, on=[year_str, month_str], how='inner'))
def split_data_by_userdata_percentage(df, percentages):
    """Split each user's data chronologically according to (train, valid, test) percentages."""
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
        user_data = df[df[user_str] == user_id].sort_values([year_str, month_str])
        u_tr, u_va, u_te = np.split(user_data, [int((train_p / 100) * len(user_data)),
                                                int(((train_p + valid_p) / 100) * len(user_data))])
        tr = pd.concat([tr, u_tr], ignore_index=True)
        va = pd.concat([va, u_va], ignore_index=True)
        te = pd.concat([te, u_te], ignore_index=True)
    return tr, va, te
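# Illustrative sanity check (not part of the pipeline): a minimal sketch of how the
# per-user (80, 10, 10) split behaves on a toy frame with the same 'user'/'Year'/'Month'
# columns. The helper name is hypothetical; call it manually if you want to verify the split.
def _example_userdata_split():
    toy = pd.DataFrame({
        user_str: [1] * 10,
        year_str: [2018] * 10,
        month_str: list(range(1, 11)),
    })
    tr, va, te = split_data_by_userdata_percentage(toy, percentages=(80, 10, 10))
    print(len(tr), len(va), len(te))  # expected chronological cut: 8 1 1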
def main():
    # print("=== Training Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("training", predefined_training_scenarios, predefined_validation_scenarios)
    # print("\n=== Validation Scenario Setup ===")
    # display_warning_about_2020_data()
    # display_warnings_for_scenarios("validation", predefined_training_scenarios, predefined_validation_scenarios)
    # === Load and preprocess ===
    df = load_dataset(DATA_PATH)
    ALLUSERS32_15MIN_WITHOUTTHREHOLD = False
    if 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' in DATA_PATH:
        ALLUSERS32_15MIN_WITHOUTTHREHOLD = True
    training_data = filter_data(df, TRAINING_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    validation_data = filter_data(df, VALIDATION_SCENARIO, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    user_data_train = prepare_user_data(training_data)
    user_data_val = prepare_user_data(validation_data)
    # === Train models ===
    best_models = train_models(user_data_train, user_data_val, sequence_lengths=SEQUENCE_LENGTHS)
    # === Load and evaluate test ===
    test_df = filter_test_data(df, TEST_SCENARIO)
    evaluate_models(best_models, test_df, SEQUENCE_LENGTHS, OUTPUT_EXCEL_PATH, ALLUSERS32_15MIN_WITHOUTTHREHOLD)
    print(f"\n✅ All evaluations completed. Results saved to: {OUTPUT_EXCEL_PATH}")
def reduce_columns(df, filename):
    """Drop calendar/date columns; for 15-minute datasets also drop the day-of-week dummy columns."""
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + weak_column_names, errors='ignore')
    else:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')
def load_previous_results(filename):
    """Load previously saved results (if any) so already-evaluated configurations can be skipped."""
    results = pd.DataFrame()
    if os.path.exists(filename):
        with open(filename) as f:
            results = pd.DataFrame(json.load(f))
    return results
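# Note: results are written below via DataFrame.to_json() and read back here with
# json.load + pd.DataFrame, so the column layout survives the round trip. A minimal
# sketch (illustrative only, the path is hypothetical):
#   pd.DataFrame({f1_string: [0.9]}).to_json('./working/roundtrip_demo.json')
#   load_previous_results('./working/roundtrip_demo.json')  # -> one-row DataFrame again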
def main_two_v2(model_type):
    seq_length = range(20, 31, 10)  # i.e. sequence lengths 20 and 30
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                if len(results[(results[timespan_str] == timespan_id) &
                               (results[threshold_str] == threshold_id) &
                               (results[sequence_length_str] == sequence_length) &
                               (results[model_type_str] == model_type)]) > 0:
                    continue
            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)
            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)
            best_models = train_models_v2(user_data_train, user_data_val,
                                          sequence_length=sequence_length,
                                          model_type=model_type)
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results,
                                 evaluate_model_on_test_data(model=best_models[sequence_length]['model'],
                                                             test_df=te,
                                                             sequence_length=sequence_length,
                                                             time_span_id=timespan_id,
                                                             threshold_id=threshold_id,
                                                             model_type=model_type,
                                                             split_id=data_split_str)],
                                ignore_index=True)
            results.to_json(result_filename_v2)
def main_two_v1():
    seq_length = [30, 25, 20, 15, 10, 5]  # Adjust this list to try other sequence lengths
    results = load_previous_results(result_filename_v1)
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),
                                           (month_split_str, split_data_by_month_percentage)]:
                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
                    timespan_id = hour_timespan_str
                    threshold_id = with_threshold_str
                    if min_timespan_str in data_filename:
                        timespan_id = min_timespan_str
                    if without_threshold_str in data_filename:
                        threshold_id = without_threshold_str
                    if len(results) > 0:
                        if len(results[(results[split_str] == split_id) &
                                       (results[timespan_str] == timespan_id) &
                                       (results[threshold_str] == threshold_id) &
                                       (results[sequence_length_str] == sequence_length) &
                                       (results[model_type_str] == model_type)]) > 0:
                            continue
                    file_path = os.path.join(dataset_path, data_filename)
                    df = load_dataset(file_path)
                    df = remove_covid_data(df)
                    tr, val, te = split_method(df, percentages=(80, 10, 10))
                    tr = reduce_columns(tr, data_filename)
                    val = reduce_columns(val, data_filename)
                    te = reduce_columns(te, data_filename)
                    user_data_train = prepare_user_data(tr)
                    user_data_val = prepare_user_data(val)
                    best_models = train_models(user_data_train, user_data_val,
                                               sequence_lengths=[sequence_length], model_type=model_type)
                    results = pd.concat([results,
                                         evaluate_model_on_test_data(model=best_models[sequence_length]['model'],
                                                                     test_df=te, split_id=split_id,
                                                                     sequence_length=sequence_length,
                                                                     time_span_id=timespan_id,
                                                                     threshold_id=threshold_id,
                                                                     model_type=model_type)], ignore_index=True)
                    results.to_json(result_filename_v1)
# === Evaluation ===
def evaluate_model_on_test_data(model, test_df, sequence_length, split_id, threshold_id, time_span_id, model_type):
    """Evaluate a trained model on the test split and return a one-row DataFrame of weighted metrics."""
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    y_pred = model.predict(x, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    recall = sklearn.metrics.recall_score(y, y_pred_classes, average='weighted')
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    return pd.DataFrame({split_str: [split_id], threshold_str: [threshold_id], timespan_str: [time_span_id],
                         sequence_length_str: [sequence_length],
                         model_type_str: [model_type], recall_str: [recall],
                         precision_str: [precision], f1_string: [f1_score]})
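# Minimal sketch of the 'weighted' averaging used above (illustrative only): per-class
# scores are averaged with each class weighted by its support, e.g.
#   y_true = [0, 0, 1]; y_hat = [0, 1, 1]
#   sklearn.metrics.f1_score(y_true, y_hat, average='weighted')  # both classes have F1 = 2/3, so ≈ 0.67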
def visualise_results_v1():
    with open(result_filename_v1) as f:
        results = pd.DataFrame(json.load(f))
    # The month-based split is consistently worse, so only the per-user data split is plotted
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]
    fig, axes = plt.subplots(2, 3)
    ax_col_id = 0
    ax_row_id = -1
    for timespan in [hour_timespan_str, min_timespan_str]:
        ax_row_id += 1
        for model in [model_type_lstm, model_type_bilstm, model_type_gru]:
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan) & (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan) & (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string], label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string], label=without_threshold_str)
            ax.legend()
            ax_col_id += 1
            ax_col_id %= 3
    fig.tight_layout()
    fig.savefig(figure_path + 'v1_results.svg')
    # Conclusion: no variant is clearly better than the others
if __name__ == "__main__":
    # main_two_v1()
    # visualise_results_v1()
    main_two_v2(model_type=model_type_gru)
    print('Done')