Browse Source

added new version to run with adjusted training process

master
Bianca Steffes 2 months ago
parent
commit
93211b811e
  1. 1877
      figures/v1_results.svg
  2. 123
      main.py
  3. 57
      pipeline.py
  4. 2
      requirements.txt

1877
figures/v1_results.svg
File diff suppressed because it is too large
View File

123
main.py

@ -4,6 +4,7 @@ import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import sklearn import sklearn
from matplotlib import pyplot as plt
from pipeline import ( from pipeline import (
load_dataset, load_dataset,
@ -12,15 +13,21 @@ from pipeline import (
prepare_user_data, prepare_user_data,
train_models, train_models,
evaluate_models, evaluate_models,
prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm
prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2
) )
year_str = 'Year' year_str = 'Year'
month_str = 'Month' month_str = 'Month'
user_str = 'user' user_str = 'user'
split_str = 'split type' split_str = 'split type'
data_split_str = 'data percentages'
month_split_str = 'month percentages'
threshold_str = 'threshold used' threshold_str = 'threshold used'
with_threshold_str = 'WITH'
without_threshold_str = 'WITHOUT'
timespan_str = 'time used' timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length' sequence_length_str = 'sequence length'
precision_str = 'precision' precision_str = 'precision'
recall_str = 'recall' recall_str = 'recall'
@ -28,12 +35,14 @@ f1_string = 'f1 score'
model_type_str = 'model type' model_type_str = 'model type'
weak_column_names = ['DayOfWeek_'+day for day in weak_column_names = ['DayOfWeek_'+day for day in
['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]] ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
figure_path = 'figures/'
# === Configurable Parameters === # === Configurable Parameters ===
dataset_path = './Datasets/' dataset_path = './Datasets/'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx' DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx' OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename = './working/evaluation_results.json'
result_filename_v1 = './working/evaluation_results.json'
result_filename_v2 = './working/evaluation_results_v2.json'
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30] SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))] TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
@ -104,26 +113,80 @@ def main():
def reduce_columns(df, filename):
    """Drop the calendar helper columns from ``df``.

    The columns 'Month', 'Year', 'date' and 'DayOfWeek' are always removed.
    When ``filename`` contains the 15-minute marker (``min_timespan_str``),
    the one-hot weekday columns listed in ``weak_column_names`` are removed
    as well. Columns that are not present are ignored.

    Args:
        filename: Name of the dataset file the frame was loaded from.

    Returns:
        The result of ``df.drop`` with the selected columns removed.
    """
    unwanted = ['Month', 'Year', 'date', 'DayOfWeek']
    if min_timespan_str in filename:
        unwanted = unwanted + weak_column_names
    return df.drop(columns=unwanted, errors='ignore')
def main_two():
def load_previous_results(filename):
    """Load previously stored evaluation results from a JSON file.

    Args:
        filename: Path of the JSON results file.

    Returns:
        A DataFrame built from the parsed JSON content, or an empty
        DataFrame when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        return pd.DataFrame()
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, encoding='utf-8') as result_file:
        return pd.DataFrame(json.load(result_file))
def main_two_v2(model_type):
    """Run the v2 training/evaluation loop for a single model type.

    For the sequence lengths 20 and 30 and for every file in
    ``dataset_path``, the data is split 80/10/10 by per-user data
    percentage, a model is tuned and trained with ``train_models_v2`` and
    then evaluated on the test split. After every run the result row is
    appended to ``result_filename_v2``. Combinations that already have a row
    in that file are skipped, so an interrupted run can be resumed.

    Args:
        model_type: One of the ``model_type_*`` constants from ``pipeline``.
    """
    for sequence_length in range(20, 31, 10):  # yields 20 and 30
        for data_filename in os.listdir(dataset_path):
            # The file name encodes both the time span and the threshold mode.
            timespan_id = (min_timespan_str if min_timespan_str in data_filename
                           else hour_timespan_str)
            threshold_id = (without_threshold_str if without_threshold_str in data_filename
                            else with_threshold_str)

            # Skip combinations that were already evaluated in an earlier run.
            results = load_previous_results(result_filename_v2)
            if len(results) > 0:
                already_done = ((results[timespan_str] == timespan_id) &
                                (results[threshold_str] == threshold_id) &
                                (results[sequence_length_str] == sequence_length) &
                                (results[model_type_str] == model_type)).any()
                if already_done:
                    continue

            file_path = os.path.join(dataset_path, data_filename)
            df = load_dataset(file_path)
            df = remove_covid_data(df)

            tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10))
            tr = reduce_columns(tr, data_filename)
            val = reduce_columns(val, data_filename)
            te = reduce_columns(te, data_filename)

            user_data_train = prepare_user_data(tr)
            user_data_val = prepare_user_data(val)

            # train_models_v2 returns the trained model itself, not a dict
            # keyed by sequence length as the v1 training function does.
            best_model = train_models_v2(user_data_train, user_data_val,
                                         sequence_length=sequence_length,
                                         model_type=model_type)

            evaluation = evaluate_model_on_test_data(model=best_model,
                                                     test_df=te,
                                                     sequence_length=sequence_length,
                                                     time_span_id=timespan_id,
                                                     threshold_id=threshold_id,
                                                     model_type=model_type,
                                                     split_id=data_split_str)

            # Reload before appending and write immediately so that each
            # finished run is persisted.
            results = load_previous_results(result_filename_v2)
            results = pd.concat([results, evaluation], ignore_index=True)
            results.to_json(result_filename_v2)
def main_two_v1():
seq_length = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
results = pd.DataFrame()
if os.path.exists(result_filename_v1):
results = pd.DataFrame(json.load(open(result_filename_v1)))
for sequence_length in seq_length:
for data_filename in os.listdir(dataset_path): for data_filename in os.listdir(dataset_path):
for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]:
for split_id, split_method in [(data_split_str, split_data_by_userdata_percentage),(month_split_str, split_data_by_month_percentage)]:
for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]: for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
timespan_id = '1HR'
threshold_id = 'WITH'
if '15MIN' in data_filename:
timespan_id = '15MIN'
if 'WITHOUT' in data_filename:
threshold_id = 'WITHOUT'
timespan_id = hour_timespan_str
threshold_id = with_threshold_str
if min_timespan_str in data_filename:
timespan_id = min_timespan_str
if without_threshold_str in data_filename:
threshold_id = without_threshold_str
if len(results) > 0: if len(results) > 0:
if len(results[(results[split_str]==split_id) & if len(results[(results[split_str]==split_id) &
(results[timespan_str]==timespan_id) & (results[timespan_str]==timespan_id) &
@ -152,8 +215,7 @@ def main_two():
time_span_id=timespan_id, time_span_id=timespan_id,
threshold_id=threshold_id, threshold_id=threshold_id,
model_type=model_type)], ignore_index=True) model_type=model_type)], ignore_index=True)
results.to_json(result_filename)
results.to_json(result_filename_v1)
# === Evaluation === # === Evaluation ===
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type): def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type):
@ -171,7 +233,34 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh
model_type_str:[model_type], recall_str:[recall], model_type_str:[model_type], recall_str:[recall],
precision_str:[precision], f1_string:[f1_score]}) precision_str:[precision], f1_string:[f1_score]})
def visualise_results_v1():
    """Plot the v1 F1 scores and save them as ``v1_results.svg``.

    Reads ``result_filename_v1``, keeps only the rows of the data-percentage
    split and draws a 2x3 grid of axes: one row per time span (1HR, 15MIN)
    and one column per model type (LSTM, BiLSTM, GRU). Each axis shows F1
    score over sequence length, with one curve for the WITH-threshold data
    and one for the WITHOUT-threshold data. The figure is written to
    ``figure_path``.
    """
    # Close the results file deterministically instead of leaking the handle.
    with open(result_filename_v1, encoding='utf-8') as result_file:
        results = pd.DataFrame(json.load(result_file))

    # The month split always performs worse, so only the data split is shown.
    results = results[results[split_str] == data_split_str]
    # Sort by sequence length so every curve runs left to right regardless of
    # the order in which the rows were stored.
    results = results.sort_values(sequence_length_str)

    timespans = [hour_timespan_str, min_timespan_str]
    models = [model_type_lstm, model_type_bilstm, model_type_gru]

    fig, axes = plt.subplots(len(timespans), len(models))
    for row_id, timespan in enumerate(timespans):
        for col_id, model in enumerate(models):
            subset = results[(results[timespan_str] == timespan) &
                             (results[model_type_str] == model)]
            ax = axes[row_id, col_id]
            ax.set_title(model+' '+timespan)
            for threshold in (with_threshold_str, without_threshold_str):
                curve = subset[subset[threshold_str] == threshold]
                ax.plot(curve[sequence_length_str], curve[f1_string], label=threshold)
            ax.legend()

    fig.tight_layout()
    os.makedirs(figure_path, exist_ok=True)
    fig.savefig(figure_path+'v1_results.svg')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    # Conclusion: no variant is clearly better than the others.
if __name__ == "__main__": if __name__ == "__main__":
main_two()
# main_two_v1()
# visualise_results_v1()
main_two_v2(model_type=model_type_gru)
print('Done') print('Done')

57
pipeline.py

@ -1,7 +1,10 @@
import keras_tuner
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import shutil import shutil
import os import os
from keras.src.metrics import F1Score
from pandas import ExcelWriter from pandas import ExcelWriter
import keras_tuner as kt import keras_tuner as kt
from tensorflow.keras.models import Sequential from tensorflow.keras.models import Sequential
@ -11,7 +14,7 @@ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras_tuner import RandomSearch from keras_tuner import RandomSearch
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score
epochs = 50
epochs = 30
model_type_gru = 'GRU' model_type_gru = 'GRU'
model_type_lstm = 'LSTM' model_type_lstm = 'LSTM'
model_type_bilstm = 'BiLSTM' model_type_bilstm = 'BiLSTM'
@ -139,6 +142,58 @@ def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./workin
return best_models return best_models
# === Training & Validation ===
def train_models_v2(user_data, user_data_val, sequence_length, model_type):
    """Tune and train one recurrent classifier for a single sequence length.

    A random hyperparameter search (units, dropout rate, learning rate) is
    run with ``val_loss`` as objective. A fresh model is then built from the
    best hyperparameters and fitted once more on the training data.

    Args:
        user_data: Mapping of user to training data; the number of keys
            determines the size of the softmax output layer.
        user_data_val: Validation data, same structure as ``user_data``.
        sequence_length: Length of the input sequences passed to
            ``prepare_data_for_model``.
        model_type: ``model_type_bilstm``, ``model_type_lstm`` or
            ``model_type_gru``.

    Returns:
        The model built from the best hyperparameters after the final fit.
        Unlike the v1 training function this is the model itself, not a
        dict keyed by sequence length.

    Raises:
        ValueError: If ``model_type`` is not one of the supported constants.
    """
    supported_types = (model_type_bilstm, model_type_lstm, model_type_gru)
    if model_type not in supported_types:
        # Fail fast: otherwise a network without any recurrent layer would
        # be built and the error would only surface during training.
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}")

    tuner_dir = "./working/tuner"

    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

    # Start every call from an empty tuner directory.
    shutil.rmtree(tuner_dir, ignore_errors=True)

    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
    x_val, y_val = prepare_data_for_model(user_data=user_data_val, sequence_length=sequence_length)

    # Feature count is taken from the third axis of x
    # (presumably (samples, sequence_length, n_features) - TODO confirm).
    n_features = x.shape[2]
    users = list(user_data.keys())

    def build_model(hp):
        """Build and compile one candidate model for the given hyperparameters."""
        units = hp.Int('units', 32, 256, step=2)
        input_shape = (sequence_length, n_features)
        if model_type == model_type_bilstm:
            recurrent_layer = Bidirectional(LSTM(units=units, input_shape=input_shape))
        elif model_type == model_type_lstm:
            recurrent_layer = LSTM(units=units, input_shape=input_shape)
        else:  # model_type_gru, guaranteed by the validation above
            recurrent_layer = GRU(units=units, input_shape=input_shape)

        model = Sequential()
        model.add(recurrent_layer)
        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
        # One output unit per user.
        model.add(Dense(len(users), activation='softmax'))
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=100,
        directory=tuner_dir,
    )

    tuner.search(x, y, epochs=epochs, validation_data=(x_val, y_val),
                 callbacks=[early_stopping, lr_scheduler])

    # Rebuild from the best hyperparameters and train that model from scratch.
    best_hps = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hps)
    best_model.fit(x, y, epochs=epochs, validation_data=(x_val, y_val),
                   callbacks=[early_stopping, lr_scheduler])

    return best_model
# === Evaluation === # === Evaluation ===
def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD): def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD):
print("\n🧪 Evaluating on Test Data...") print("\n🧪 Evaluating on Test Data...")

2
requirements.txt

@ -46,3 +46,5 @@ tzdata==2025.2
urllib3==2.5.0 urllib3==2.5.0
Werkzeug==3.1.3 Werkzeug==3.1.3
wrapt==1.17.2 wrapt==1.17.2
matplotlib~=3.10.6
Loading…
Cancel
Save