|
|
@ -4,6 +4,7 @@ import os |
|
|
import numpy as np |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
import sklearn |
|
|
import sklearn |
|
|
|
|
|
from matplotlib import pyplot as plt |
|
|
|
|
|
|
|
|
from pipeline import ( |
|
|
from pipeline import ( |
|
|
load_dataset, |
|
|
load_dataset, |
|
|
@ -12,15 +13,21 @@ from pipeline import ( |
|
|
prepare_user_data, |
|
|
prepare_user_data, |
|
|
train_models, |
|
|
train_models, |
|
|
evaluate_models, |
|
|
evaluate_models, |
|
|
prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm |
|
|
|
|
|
|
|
|
prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2 |
|
|
) |
|
|
) |
|
|
|
|
|
|
|
|
# --- String constants shared across the script ---
# NOTE(review): year/month/user look like dataset column names; their use is
# not visible in this part of the file -- confirm against the callers.
year_str = 'Year'
month_str = 'Month'
user_str = 'user'

# Results-table column holding the split method, and its two possible values.
split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'

# Results-table column holding the threshold variant, and its two values.
# The values double as substrings searched for in dataset file names.
threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'

# Results-table column holding the time resolution, and its two values.
# The values double as substrings searched for in dataset file names.
timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'

# Results-table columns for the sequence length and the metrics.
sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
|
|
@ -28,12 +35,14 @@ f1_string = 'f1 score' |
|
|
# Results-table column holding the model architecture identifier.
model_type_str = 'model type'

# One 'DayOfWeek_<day>' column name per weekday; reduce_columns() drops these
# columns for the 15-minute datasets.
weak_column_names = [
    'DayOfWeek_' + day
    for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday']
]

# Directory that generated figures are written to.
figure_path = 'figures/'

# === Configurable Parameters ===
dataset_path = './Datasets/'
DATA_PATH = dataset_path + 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'

# Result stores for the two experiment variants (v1 and v2).
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
# Pre-rename name of the v1 result file, kept so existing references still work.
result_filename = result_filename_v1

# Sequence lengths to evaluate, longest first.
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]

# presumably (year, months) pairs: all of 2018 plus January-September 2019
# -- TODO confirm against the code that consumes this constant.
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
|
|
@ -104,26 +113,80 @@ def main(): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def reduce_columns(df, filename):
    """Return a copy of ``df`` without the date-related helper columns.

    'Month', 'Year', 'date' and 'DayOfWeek' are always dropped. When
    ``filename`` contains the 15-minute marker (``min_timespan_str``), the
    per-weekday columns listed in ``weak_column_names`` are dropped as well.
    Columns that are not present in ``df`` are silently ignored.

    Args:
        df: DataFrame to reduce.
        filename: Name of the dataset file ``df`` was loaded from.

    Returns:
        The DataFrame produced by ``df.drop``; ``df`` itself is not modified.
    """
    columns_to_drop = ['Month', 'Year', 'date', 'DayOfWeek']
    if min_timespan_str in filename:
        columns_to_drop = columns_to_drop + weak_column_names
    return df.drop(columns=columns_to_drop, errors='ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main_two(): |
|
|
|
|
|
|
|
|
def load_previous_results(filename):
    """Load previously stored evaluation results from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A DataFrame built from the parsed JSON content, or an empty
        DataFrame when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        return pd.DataFrame()
    # Context manager so the file handle is closed even if parsing fails.
    with open(filename) as result_file:
        return pd.DataFrame(json.load(result_file))
|
|
|
|
|
|
|
|
|
|
|
def main_two_v2(model_type):
    """Train and evaluate ``model_type`` on every dataset file (v2 run).

    For each sequence length (20 and 30) and each file in ``dataset_path``,
    the data is loaded, split with ``split_data_by_userdata_percentage``,
    reduced with ``reduce_columns``, trained with ``train_models_v2`` and
    evaluated on the test part. Each evaluation row is appended to the JSON
    file ``result_filename_v2``. Combinations that already have a row in
    that file are skipped, so an interrupted run can be resumed.

    Args:
        model_type: Model architecture identifier passed to
            ``train_models_v2`` and stored in the results.
    """
    seq_length = range(20, 31, 10)  # yields 20 and 30
    for sequence_length in seq_length:
        for data_filename in os.listdir(dataset_path):
            # The variant is encoded in the file name. Start from the defaults
            # and override them when the more specific marker is present
            # ('WITH' is itself a substring of 'WITHOUT').
            timespan_id = hour_timespan_str
            threshold_id = with_threshold_str
            if min_timespan_str in data_filename:
                timespan_id = min_timespan_str
            if without_threshold_str in data_filename:
                threshold_id = without_threshold_str

            # Skip combinations that were already evaluated. The length guard
            # avoids a column lookup on an empty, column-less frame.
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                already_done = (
                    (results[timespan_str] == timespan_id)
                    & (results[threshold_str] == threshold_id)
                    & (results[sequence_length_str] == sequence_length)
                    & (results[model_type_str] == model_type)
                )
                if already_done.any():
                    continue

            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)

            # presumably an 80/10/10 train/validation/test split -- confirm
            # against split_data_by_userdata_percentage.
            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)

            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)

            best_models = train_models_v2(user_data_train, user_data_val,
                                          sequence_length=sequence_length,
                                          model_type=model_type)

            new_row = evaluate_model_on_test_data(
                model=best_models[sequence_length]['model'],
                test_df=te,
                sequence_length=sequence_length,
                time_span_id=timespan_id,
                threshold_id=threshold_id,
                model_type=model_type,
                split_id=data_split_str)

            # Re-read the stored results right before appending, as the
            # original does, then write the extended table back.
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results, new_row], ignore_index=True)
            results.to_json(result_filename_v2)
|
|
|
|
|
|
|
|
|
|
|
def main_two_v1(): |
|
|
|
|
|
seq_length = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30] |
|
|
|
|
|
results = pd.DataFrame() |
|
|
|
|
|
if os.path.exists(result_filename_v1): |
|
|
|
|
|
results = pd.DataFrame(json.load(open(result_filename_v1))) |
|
|
|
|
|
for sequence_length in seq_length: |
|
|
for data_filename in os.listdir(dataset_path): |
|
|
for data_filename in os.listdir(dataset_path): |
|
|
for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]: |
|
|
|
|
|
|
|
|
for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),(month_split_str, split_data_by_month_percentage)]: |
|
|
for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]: |
|
|
for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]: |
|
|
timespan_id = '1HR' |
|
|
|
|
|
threshold_id = 'WITH' |
|
|
|
|
|
if '15MIN' in data_filename: |
|
|
|
|
|
timespan_id = '15MIN' |
|
|
|
|
|
if 'WITHOUT' in data_filename: |
|
|
|
|
|
threshold_id = 'WITHOUT' |
|
|
|
|
|
|
|
|
timespan_id = hour_timespan_str |
|
|
|
|
|
threshold_id = with_threshold_str |
|
|
|
|
|
if min_timespan_str in data_filename: |
|
|
|
|
|
timespan_id = min_timespan_str |
|
|
|
|
|
if without_threshold_str in data_filename: |
|
|
|
|
|
threshold_id = without_threshold_str |
|
|
if len(results) > 0: |
|
|
if len(results) > 0: |
|
|
if len(results[(results[split_str]==split_id) & |
|
|
if len(results[(results[split_str]==split_id) & |
|
|
(results[timespan_str]==timespan_id) & |
|
|
(results[timespan_str]==timespan_id) & |
|
|
@ -152,8 +215,7 @@ def main_two(): |
|
|
time_span_id=timespan_id, |
|
|
time_span_id=timespan_id, |
|
|
threshold_id=threshold_id, |
|
|
threshold_id=threshold_id, |
|
|
model_type=model_type)], ignore_index=True) |
|
|
model_type=model_type)], ignore_index=True) |
|
|
results.to_json(result_filename) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results.to_json(result_filename_v1) |
|
|
|
|
|
|
|
|
# === Evaluation === |
|
|
# === Evaluation === |
|
|
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type): |
|
|
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type): |
|
|
@ -171,7 +233,34 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh |
|
|
model_type_str:[model_type], recall_str:[recall], |
|
|
model_type_str:[model_type], recall_str:[recall], |
|
|
precision_str:[precision], f1_string:[f1_score]}) |
|
|
precision_str:[precision], f1_string:[f1_score]}) |
|
|
|
|
|
|
|
|
|
|
|
def visualise_results_v1():
    """Plot the F1 score over sequence length for the stored v1 results.

    Reads ``result_filename_v1``, keeps only the rows of the data-percentage
    split and draws a 2x3 grid of axes: one row per time span (1HR, 15MIN)
    and one column per model type (LSTM, BiLSTM, GRU). Each axis shows one
    line for the WITH-threshold and one for the WITHOUT-threshold results.
    The figure is saved as ``v1_results.svg`` inside ``figure_path``.
    """
    # Context manager so the file handle is closed after parsing.
    with open(result_filename_v1) as result_file:
        results = pd.DataFrame(json.load(result_file))

    # The month split is always worse, so only the data split is shown.
    results = results[results[split_str] == data_split_str]
    with_threshold = results[results[threshold_str] == with_threshold_str]
    without_threshold = results[results[threshold_str] == without_threshold_str]

    timespans = [hour_timespan_str, min_timespan_str]
    models = [model_type_lstm, model_type_bilstm, model_type_gru]
    fig, axes = plt.subplots(len(timespans), len(models))

    for ax_row_id, timespan in enumerate(timespans):
        for ax_col_id, model in enumerate(models):
            with_sub = with_threshold[(with_threshold[timespan_str] == timespan) &
                                      (with_threshold[model_type_str] == model)]
            without_sub = without_threshold[(without_threshold[timespan_str] == timespan) &
                                            (without_threshold[model_type_str] == model)]
            ax = axes[ax_row_id, ax_col_id]
            ax.set_title(model + ' ' + timespan)
            ax.plot(with_sub[sequence_length_str], with_sub[f1_string], label=with_threshold_str)
            ax.plot(without_sub[sequence_length_str], without_sub[f1_string], label=without_threshold_str)
            ax.legend()

    fig.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(figure_path, exist_ok=True)
    fig.savefig(figure_path + 'v1_results.svg')
    # Conclusion: no variant is clearly better than the others.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Alternative entry points, currently disabled:
    # main_two_v1()
    # visualise_results_v1()
    main_two_v2(model_type=model_type_gru)
    print('Done')