Browse Source

Added differentiation between three model types (LSTM, BiLSTM, GRU)

master
Bianca Steffes 5 days ago
parent
commit
b1b63d416d
  1. 1
      .gitignore
  2. 78
      main.py
  3. 31
      pipeline.py

1
.gitignore

@ -140,3 +140,4 @@ cython_debug/
.idea
working/tuner
working

78
main.py

@ -12,8 +12,7 @@ from pipeline import (
prepare_user_data,
train_models,
evaluate_models,
display_warning_about_2020_data,
display_warnings_for_scenarios, prepare_data_for_model
prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm
)
year_str = 'Year'
@ -26,6 +25,7 @@ sequence_length_str = 'sequence length'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'
weak_column_names = ['DayOfWeek_'+day for day in
['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
@ -34,7 +34,7 @@ dataset_path = './Datasets/'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename = './working/evaluation_results.json'
SEQUENCE_LENGTHS = [20, 15, 10, 5, 1] # You can add more: [20, 25, 30]
SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5] # You can add more: [20, 25, 30]
TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
@ -117,40 +117,47 @@ def main_two():
for sequence_length in SEQUENCE_LENGTHS:
for data_filename in os.listdir(dataset_path):
for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]:
timespan_id = '1HR'
threshold_id = 'WITH'
if '15MIN' in data_filename:
timespan_id = '15MIN'
if 'WITHOUT' in data_filename:
threshold_id = 'WITHOUT'
if len(results) > 0:
if len(results[(results[split_str]==split_id) &
(results[timespan_str]==timespan_id) &
(results[threshold_str]==threshold_id) &
(results[sequence_length_str]==sequence_length)]) > 0:
continue
file_path = os.path.join(dataset_path, data_filename)
df = load_dataset(file_path)
df = remove_covid_data(df)
tr,val,te = split_method(df, percentages=(80,10,10))
tr = reduce_columns(tr, data_filename)
val = reduce_columns(val, data_filename)
te = reduce_columns(te, data_filename)
user_data_train = prepare_user_data(tr)
user_data_val = prepare_user_data(val)
best_models = train_models(user_data_train, user_data_val, sequence_lengths=[sequence_length])
results = pd.concat([results,
evaluate_model_on_test_data(model=best_models[sequence_length]['model'], test_df=te, split_id=split_id,
sequence_length=sequence_length, time_span_id=timespan_id, threshold_id=threshold_id)], ignore_index=True)
results.to_json(result_filename)
for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
timespan_id = '1HR'
threshold_id = 'WITH'
if '15MIN' in data_filename:
timespan_id = '15MIN'
if 'WITHOUT' in data_filename:
threshold_id = 'WITHOUT'
if len(results) > 0:
if len(results[(results[split_str]==split_id) &
(results[timespan_str]==timespan_id) &
(results[threshold_str]==threshold_id) &
(results[sequence_length_str]==sequence_length) &
(results[model_type_str]==model_type)]) > 0:
continue
file_path = os.path.join(dataset_path, data_filename)
df = load_dataset(file_path)
df = remove_covid_data(df)
df = df.head(1000) # TODO: remove
tr,val,te = split_method(df, percentages=(80,10,10))
tr = reduce_columns(tr, data_filename)
val = reduce_columns(val, data_filename)
te = reduce_columns(te, data_filename)
user_data_train = prepare_user_data(tr)
user_data_val = prepare_user_data(val)
best_models = train_models(user_data_train, user_data_val, sequence_lengths=[sequence_length], model_type=model_type)
results = pd.concat([results,
evaluate_model_on_test_data(model=best_models[sequence_length]['model'],
test_df=te, split_id=split_id,
sequence_length=sequence_length,
time_span_id=timespan_id,
threshold_id=threshold_id,
model_type=model_type)], ignore_index=True)
results.to_json(result_filename)
# === Evaluation ===
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id):
def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type):
user_data = prepare_user_data(test_df)
x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)
@ -161,7 +168,8 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh
precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
return pd.DataFrame({split_str:[split_id], threshold_str:[threshold_id], timespan_str:[time_span_id],
sequence_length_str:[sequence_length], recall_str:[recall],
sequence_length_str:[sequence_length],
model_type_str:[model_type], recall_str:[recall],
precision_str:[precision], f1_string:[f1_score]})

31
pipeline.py

@ -5,12 +5,17 @@ import os
from pandas import ExcelWriter
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras_tuner import RandomSearch
from sklearn.metrics import accuracy_score
epochs = 2#50 # TODO: change
model_type_gru = 'GRU'
model_type_lstm = 'LSTM'
model_type_bilstm = 'BiLSTM'
# === Display functions ===
def display_warning_about_2020_data():
print("\n⚠️ Warning: 2020 data after February is excluded due to COVID-19.")
@ -67,7 +72,7 @@ def prepare_data_for_model(user_data, sequence_length):
return X,y
# === Training & Validation ===
def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./working/tuner"):
def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./working/tuner", model_type=model_type_lstm):
best_models = {}
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
@ -88,8 +93,15 @@ def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./w
def build_model(hp):
model = Sequential()
model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
input_shape=(sequence_length, n_features))))
if model_type==model_type_bilstm:
model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
input_shape=(sequence_length, n_features))))
if model_type==model_type_lstm:
model.add(LSTM(units=hp.Int('units', 32, 256, step=2),
input_shape=(sequence_length, n_features)))
if model_type==model_type_gru:
model.add(GRU(units=hp.Int('units', 32, 256, step=2),
input_shape=(sequence_length, n_features)))
model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
model.add(Dense(len(users), activation='softmax'))
model.compile(
@ -102,18 +114,18 @@ def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./w
tuner = RandomSearch(
build_model,
objective='val_loss',
max_trials=30,
max_trials=2, #30, TODO: change
executions_per_trial=2,
directory=tuner_dir,
project_name=f'lstm_seq_{sequence_length}'
)
tuner.search(X, y, epochs=30, validation_data=(X_val, y_val),
callbacks=[early_stopping, lr_scheduler], verbose=1)
tuner.search(X, y, epochs=epochs, validation_data=(X_val, y_val),
callbacks=[early_stopping, lr_scheduler], verbose=0)
best_hps = tuner.get_best_hyperparameters(1)[0]
best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X, y, epochs=30, validation_data=(X_val, y_val),
best_model.fit(X, y, epochs=epochs, validation_data=(X_val, y_val),
callbacks=[early_stopping, lr_scheduler], verbose=0)
best_models[sequence_length] = {
@ -174,12 +186,15 @@ def evaluate_model_on_test_data(model, test_df, sequence_length, excel_writer, A
y_pred = model.predict(X, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
# counts which class was predicted how often
unique_pred, counts_pred = np.unique(y_pred_classes, return_counts=True)
label_counts_pred = dict(zip(unique_pred, counts_pred))
# counts which class should have been predicted how often (only one class for the user)
unique_true, counts_true = np.unique(y_true, return_counts=True)
label_counts_true = dict(zip(unique_true, counts_true))
# the fraction of correctly classified samples
acc = accuracy_score(y_true, y_pred_classes)
if acc > 0.5:
accuracy_above_50 += 1

Loading…
Cancel
Save