
Added Baselines + extended manual evaluation

master
Bianca Steffes, 3 weeks ago
commit 40b32c30d3
  1. .gitignore (1 line changed)
  2. main.py (57 lines changed)
  3. pipeline.py (32 lines changed)

.gitignore (1 line changed)

@@ -142,3 +142,4 @@ cython_debug/
working/tuner
working
figures
baseline_results.json

main.py (57 lines changed)

@@ -4,7 +4,10 @@ import os
import numpy as np
import pandas as pd
import sklearn
from keras.regularizers import L1L2  # public import path; keras.src is internal to Keras
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.dummy import DummyClassifier
from pipeline import (
    load_dataset,
@@ -13,7 +16,8 @@ from pipeline import (
    prepare_user_data,
    train_models,
    evaluate_models,
    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model
    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model,
    eval_metrics
)
year_str = 'Year'
@@ -29,6 +33,7 @@ timespan_str = 'time used'
hour_timespan_str = '1HR'
min_timespan_str = '15MIN'
sequence_length_str = 'sequence length'
accuracy_str = 'accuracy'
precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
@@ -332,7 +337,7 @@ def manual_tuning(model_type):
    df = load_dataset(file_path)
    df = remove_covid_data(df)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=100)
    tr = reduce_columns(tr, data_filename)
    val = reduce_columns(val, data_filename)
    te = reduce_columns(te, data_filename)
@@ -342,15 +347,18 @@ def manual_tuning(model_type):
    # fit and evaluate model
    # config
    repeats = 5
    n_batch = 4
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 1
    n_neurons = 16
    l_rate = 1e-4
    reg = L1L2(l1=0.0, l2=0.0)
    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs, n_neurons,
        history = train_one_model(user_data_train, user_data_val, n_batch, n_epochs,
                                  n_neurons, l_rate, reg,
                                  sequence_length=sequence_length,
                                  model_type=model_type)
        history_list.append(history)
@@ -358,11 +366,45 @@ def manual_tuning(model_type):
    for history in history_list:
        plt.plot(history['train_'+metric], color='blue')
        plt.plot(history['test_'+metric], color='orange')
    plt.savefig(figure_path+metric+'_epochs_diagnostic.png')
    plt.savefig(figure_path+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
                str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
    plt.clf()
    print('Done')
def calculate_baselines():
    file_combinations = [(hour_timespan_str, with_threshold_str, 'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
                         (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
                         (min_timespan_str, without_threshold_str, 'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'),
                         (hour_timespan_str, without_threshold_str, 'ALLUSERS_32_1HR_WITHOUT_THRESHOLD.xlsx'),
                         ]
    baseline_res = pd.DataFrame()
    for timespan_id, threshold_id, filename in file_combinations:
        file_path = os.path.join(dataset_path, filename)
        df = load_dataset(file_path)
        df = remove_covid_data(df)
        _, _, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=20)
        te = reduce_columns(te, filename)
        user_data_te = prepare_user_data(te)
        for sequence_length in range(5, 30, 5):
            x, y = prepare_data_for_model(user_data=user_data_te, sequence_length=sequence_length)
            for strategy in ['most_frequent', 'stratified', 'uniform']:
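                # the dummy strategies ignore the input features, so fitting on the test split
                # only fixes the label distribution the baseline predictions are drawn from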
                cls = DummyClassifier(strategy=strategy)
                cls.fit(x, y)
                y_pred = cls.predict(x)
                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
                baseline_res = pd.concat([baseline_res,
                                          DataFrame({'strategy': [strategy], threshold_str: [threshold_id],
                                                     timespan_str: [timespan_id], sequence_length_str: [sequence_length],
                                                     accuracy_str: [acc], precision_str: [p], recall_str: [r],
                                                     f1_string: [f1]})], ignore_index=True)
    baseline_res.to_json('baseline_results.json')
    print('Done')
if __name__ == "__main__":
    # main_two_v1()
    # visualise_results_v1()
@@ -370,4 +412,5 @@ if __name__ == "__main__":
    # main_two_v2(model_type=model_type_gru)
    # visualise_results_v2()
    manual_tuning(model_type=model_type_lstm)
    # calculate_baselines()
    print('Done')

pipeline.py (32 lines changed)

@@ -1,3 +1,5 @@
import random
import keras_tuner
import numpy as np
import pandas as pd
@@ -78,6 +80,8 @@ def prepare_data_for_model(user_data, sequence_length):
        x_new, y_new = make_sequences(data, sequence_length)
        x = x + x_new
        y = y + y_new
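    # both generators are seeded with 17, so x and y receive the same permutation and stay aligned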
    random.Random(17).shuffle(x)
    random.Random(17).shuffle(y)
    x = np.array(x)
    y = np.array(y)
    return x, y
@@ -205,7 +209,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
    return tuner.get_best_models(num_models=1)[0]
def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence_length, model_type):
def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
    x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
    n_features = x.shape[2]
    users = list(train_data.keys())
@@ -213,7 +217,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence
    # prepare model
    def build_model():
        model = Sequential()
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch, bias_regularizer=reg))
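        # note: Keras Input() does not accept a bias_regularizer argument, so this call would fail;
        # the regularizer presumably belongs on the LSTM/GRU/Dense layers instead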
        # if model_type == model_type_bilstm:
        #     model.add(Bidirectional(units=units_hp))
        if model_type == model_type_lstm:
@@ -225,7 +229,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence
        # model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
        model.add(Dense(len(users), activation='softmax'))
        model.compile(
            optimizer=Adam(learning_rate=1e-5),
            optimizer=Adam(learning_rate=l_rate),
            loss=SparseCategoricalCrossentropy(),
            metrics=[SparseCategoricalAccuracy()],
        )
@@ -234,21 +238,24 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, sequence
    model = build_model()
    # fit model
    train_p, test_p, train_r, test_r, train_f1, test_f1 = list(), list(), list(), list(), list(), list()
    train_acc, test_acc, train_p, test_p, train_r, test_r, train_f1, test_f1 = list(), list(), list(), list(), list(), list(), list(), list()
    for i in range(n_epochs):
        model.fit(x, y, batch_size=n_batch, epochs=1, verbose=0, shuffle=False)
        # evaluate model on train data
        p, r, f1 = evaluate(model, train_data, sequence_length, n_batch)
        acc, p, r, f1 = evaluate(model, train_data, sequence_length, n_batch)
        train_acc.append(acc)
        train_p.append(p)
        train_r.append(r)
        train_f1.append(f1)
        # evaluate model on test data
        p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
        test_acc.append(acc)
        test_p.append(p)
        test_r.append(r)
        test_f1.append(f1)
    history = DataFrame()
    history['train_acc'], history['test_acc'] = train_acc, test_acc
    history['train_p'], history['test_p'] = train_p, test_p
    history['train_r'], history['test_r'] = train_r, test_r
    history['train_f1'], history['test_f1'] = train_f1, test_f1
@@ -262,11 +269,16 @@ def evaluate(model, df, sequence_length, batch_size):
    y_pred = model.predict(x, verbose=0, batch_size=batch_size)
    y_pred_classes = np.argmax(y_pred, axis=1)
    f1 = f1_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
    p = precision_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
    r = recall_score(y_true=y_true, y_pred=y_pred_classes, average='weighted')
    return p, r, f1
    return eval_metrics(y_true=y_true, y_pred=y_pred_classes)

def eval_metrics(y_true, y_pred):
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    p = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
    r = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    return acc, p, r, f1
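
For readers checking the new helper: precision, recall and F1 are support-weighted averages over the user classes, while accuracy is plain label agreement. A tiny illustrative check (toy labels, not from the repository):

# toy example with three imbalanced classes
y_true = [0, 0, 0, 1, 2]
y_pred = [0, 0, 1, 1, 2]
acc, p, r, f1 = eval_metrics(y_true=y_true, y_pred=y_pred)
# acc = 0.8, weighted precision = 0.9, weighted recall = 0.8 (weighted recall always equals accuracy)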
# === Evaluation ===
def evaluate_models(best_models, df_test, sequence_lengths, output_excel_path, ALLUSERS32_15MIN_WITHOUTTHREHOLD):
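
As a usage note on the new baseline export: calculate_baselines() writes a pandas DataFrame to baseline_results.json via DataFrame.to_json, so it can be reloaded for comparison against the trained models. A minimal sketch, assuming the file sits in the working directory:

import pandas as pd

baselines = pd.read_json('baseline_results.json')
# highest dummy-classifier F1 per dataset variant, i.e. the floor the trained models should beat
print(baselines.sort_values('f1 score', ascending=False).head())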
