Add differentiation between three model types (LSTM, BiLSTM, GRU)

3 months ago · b1b63d416d
3 changed files with 67 additions and 43 deletions
--- a/.gitignore
+++ b/.gitignore
@ -140,3 +140,4 @@ cython_debug/

 .idea
 working/tuner
+working
--- a/main.py
+++ b/main.py
@ -12,8 +12,7 @@ from pipeline import (
    prepare_user_data,
    train_models,
    evaluate_models,
-    display_warning_about_2020_data,
-    display_warnings_for_scenarios, prepare_data_for_model
+    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm
 )

 year_str = 'Year'
@ -26,6 +25,7 @@ sequence_length_str = 'sequence length'
 precision_str = 'precision'
 recall_str = 'recall'
 f1_string = 'f1 score'
+model_type_str = 'model type'
 weak_column_names = ['DayOfWeek_'+day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]

@ -34,7 +34,7 @@ dataset_path = './Datasets/'
 DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
 OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
 result_filename = './working/evaluation_results.json'
-SEQUENCE_LENGTHS = [20, 15, 10, 5, 1]  # You can add more: [20, 25, 30]
+SEQUENCE_LENGTHS = [30, 25, 20, 15, 10, 5]  # Sequence lengths to evaluate; extend this list to try more

 TRAINING_SCENARIO = [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]
 VALIDATION_SCENARIO = [(2019, [10, 11, 12])]
@ -117,40 +117,47 @@ def main_two():
    for sequence_length in SEQUENCE_LENGTHS:
        for data_filename in os.listdir(dataset_path):
            for split_id, split_method in [('data percentages', split_data_by_userdata_percentage),('month percentages', split_data_by_month_percentage)]:
-                timespan_id = '1HR'
-                threshold_id = 'WITH'
-                if '15MIN' in data_filename:
-                    timespan_id = '15MIN'
-                if 'WITHOUT' in data_filename:
-                    threshold_id = 'WITHOUT'
-                if len(results) > 0:
-                    if len(results[(results[split_str]==split_id) &
-                                   (results[timespan_str]==timespan_id) &
-                                   (results[threshold_str]==threshold_id) &
-                                   (results[sequence_length_str]==sequence_length)]) > 0:
-                        continue
-
-                file_path = os.path.join(dataset_path, data_filename)
-                df = load_dataset(file_path)
-                df = remove_covid_data(df)
-                tr,val,te = split_method(df, percentages=(80,10,10))
-                tr = reduce_columns(tr, data_filename)
-                val = reduce_columns(val, data_filename)
-                te = reduce_columns(te, data_filename)
-
-                user_data_train = prepare_user_data(tr)
-                user_data_val = prepare_user_data(val)
-
-                best_models = train_models(user_data_train, user_data_val, sequence_lengths=[sequence_length])
-
-                results = pd.concat([results,
-                                     evaluate_model_on_test_data(model=best_models[sequence_length]['model'], test_df=te, split_id=split_id,
-                                     sequence_length=sequence_length, time_span_id=timespan_id, threshold_id=threshold_id)], ignore_index=True)
-                results.to_json(result_filename)
+                for model_type in [model_type_lstm, model_type_bilstm, model_type_gru]:
+                    timespan_id = '1HR'
+                    threshold_id = 'WITH'
+                    if '15MIN' in data_filename:
+                        timespan_id = '15MIN'
+                    if 'WITHOUT' in data_filename:
+                        threshold_id = 'WITHOUT'
+                    if len(results) > 0:
+                        if len(results[(results[split_str]==split_id) &
+                                       (results[timespan_str]==timespan_id) &
+                                       (results[threshold_str]==threshold_id) &
+                                       (results[sequence_length_str]==sequence_length) &
+                                       (results[model_type_str]==model_type)]) > 0:
+                            continue
+
+                    file_path = os.path.join(dataset_path, data_filename)
+                    df = load_dataset(file_path)
+                    df = remove_covid_data(df)
+                    df = df.head(1000) # TODO: remove
+                    tr,val,te = split_method(df, percentages=(80,10,10))
+                    tr = reduce_columns(tr, data_filename)
+                    val = reduce_columns(val, data_filename)
+                    te = reduce_columns(te, data_filename)
+
+                    user_data_train = prepare_user_data(tr)
+                    user_data_val = prepare_user_data(val)
+
+                    best_models = train_models(user_data_train, user_data_val, sequence_lengths=[sequence_length], model_type=model_type)
+
+                    results = pd.concat([results,
+                                         evaluate_model_on_test_data(model=best_models[sequence_length]['model'],
+                                                                     test_df=te, split_id=split_id,
+                                                                     sequence_length=sequence_length,
+                                                                     time_span_id=timespan_id,
+                                                                     threshold_id=threshold_id,
+                                                                     model_type=model_type)], ignore_index=True)
+                    results.to_json(result_filename)


 # === Evaluation ===
-def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id):
+def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, threshold_id, time_span_id, model_type):
    user_data = prepare_user_data(test_df)
    x, y = prepare_data_for_model(user_data=user_data, sequence_length=sequence_length)

@ -161,7 +168,8 @@ def evaluate_model_on_test_data(model, test_df,sequence_length, split_id, thresh
    precision = sklearn.metrics.precision_score(y, y_pred_classes, average='weighted')
    f1_score = sklearn.metrics.f1_score(y, y_pred_classes, average='weighted')
    return pd.DataFrame({split_str:[split_id], threshold_str:[threshold_id], timespan_str:[time_span_id],
-                         sequence_length_str:[sequence_length], recall_str:[recall],
+                         sequence_length_str:[sequence_length],
+                         model_type_str:[model_type], recall_str:[recall],
                         precision_str:[precision], f1_string:[f1_score]})


--- a/pipeline.py
+++ b/pipeline.py
@ -5,12 +5,17 @@ import os
 from pandas import ExcelWriter
 import keras_tuner as kt
 from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
+from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,GRU
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
 from sklearn.metrics import accuracy_score

+epochs = 2  # TODO: change back to 50
+model_type_gru = 'GRU'
+model_type_lstm = 'LSTM'
+model_type_bilstm = 'BiLSTM'
+
 # === Display functions ===
 def display_warning_about_2020_data():
    print("\n⚠️ Warning: 2020 data after February is excluded due to COVID-19.")
@ -67,7 +72,7 @@ def prepare_data_for_model(user_data, sequence_length):
    return X,y

 # === Training & Validation ===
-def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./working/tuner"):
+def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./working/tuner", model_type=model_type_lstm):
    best_models = {}
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
@ -88,8 +93,15 @@ def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./w

        def build_model(hp):
            model = Sequential()
-            model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
-                                         input_shape=(sequence_length, n_features))))
+            if model_type==model_type_bilstm:
+                model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
+                                             input_shape=(sequence_length, n_features))))
+            if model_type==model_type_lstm:
+                model.add(LSTM(units=hp.Int('units', 32, 256, step=2),
+                                             input_shape=(sequence_length, n_features)))
+            if model_type==model_type_gru:
+                model.add(GRU(units=hp.Int('units', 32, 256, step=2),
+                                             input_shape=(sequence_length, n_features)))
            model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
            model.add(Dense(len(users), activation='softmax'))
            model.compile(
@ -102,18 +114,18 @@ def train_models(user_data, user_data_val, sequence_lengths=[20], tuner_dir="./w
        tuner = RandomSearch(
            build_model,
            objective='val_loss',
-            max_trials=30,
+            max_trials=2,  # TODO: change back to 30
            executions_per_trial=2,
            directory=tuner_dir,
            project_name=f'lstm_seq_{sequence_length}'
        )

-        tuner.search(X, y, epochs=30, validation_data=(X_val, y_val),
-                     callbacks=[early_stopping, lr_scheduler], verbose=1)
+        tuner.search(X, y, epochs=epochs, validation_data=(X_val, y_val),
+                     callbacks=[early_stopping, lr_scheduler], verbose=0)

        best_hps = tuner.get_best_hyperparameters(1)[0]
        best_model = tuner.hypermodel.build(best_hps)
-        best_model.fit(X, y, epochs=30, validation_data=(X_val, y_val),
+        best_model.fit(X, y, epochs=epochs, validation_data=(X_val, y_val),
                       callbacks=[early_stopping, lr_scheduler], verbose=0)

        best_models[sequence_length] = {
@ -174,12 +186,15 @@ def evaluate_model_on_test_data(model, test_df, sequence_length, excel_writer, A
        y_pred = model.predict(X, verbose=0)
        y_pred_classes = np.argmax(y_pred, axis=1)

+        # Count how many times each class was predicted
        unique_pred, counts_pred = np.unique(y_pred_classes, return_counts=True)
        label_counts_pred = dict(zip(unique_pred, counts_pred))

+        # Count how many times each true class occurs (expected to be a single class per user)
        unique_true, counts_true = np.unique(y_true, return_counts=True)
        label_counts_true = dict(zip(unique_true, counts_true))

+        # Fraction of samples that were classified correctly
        acc = accuracy_score(y_true, y_pred_classes)
        if acc > 0.5:
            accuracy_above_50 += 1