
Updated code for different tests

Branch: master · Bianca Steffes · 2 weeks ago · commit 98831ce4f3
3 changed files:
1. environment.yaml (1)
2. main.py (92)
3. pipeline.py (46)

environment.yaml:

@@ -27,6 +27,7 @@ dependencies:
 - Markdown==3.8.2
 - markdown-it-py==3.0.0
 - MarkupSafe==3.0.2
+- matplotlib
 - mdurl==0.1.2
 - ml_dtypes==0.5.1
 - namex==0.1.0

main.py:

@@ -1,6 +1,7 @@
 import json
 import os
+import math
 import numpy as np
 import pandas as pd
 import sklearn
@@ -18,7 +19,7 @@ from pipeline import (
     train_models,
     evaluate_models,
     prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model,
-    eval_metrics
+    eval_metrics, get_save_id
 )

 year_str = 'Year'
@@ -45,6 +46,7 @@ model_type_str = 'model type'
 week_column_names = ['DayOfWeek_' + day for day in
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
 figure_path = 'figures/'
+predicitons_path = 'preds/'

 # === Configurable Parameters ===
 dataset_path = './Datasets/'
@@ -69,8 +71,17 @@ predefined_validation_scenarios = {
     "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
 }

+def create_dir(path):
+    """
+    Creates a directory if it doesn't exist yet.
+    :param path: The path to the directory
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+
 def remove_covid_data(df):
-    df = df[~((df[year_str]==2020) & (df[month_str]>2))]
+    df = df[~(df[year_str]>=2020)]
     return df

 def split_data_by_month_percentage(df, percentages):
@@ -381,9 +392,24 @@ def manual_tuning(model_type):
     print('Done')

+def upsampling(df):
+    max_user_data = df[user_str].value_counts().max()
+    for user in df[user_str].unique():
+        user_data = df[df[user_str]==user]
+        user_count = user_data.shape[0]
+        times = max_user_data / user_count
+        before_comma = math.floor(times)
+        after_comma = times % 1
+        after_comma_data = user_data.sample(frac=after_comma)
+        for i in range(1, before_comma):
+            df = pd.concat([df, user_data], ignore_index=True)
+        df = pd.concat([df, after_comma_data], ignore_index=True)
+    return df

 def manual_tuning_v3(model_type):
     # TODO: hrs/min + different sequence lengths
-    sequence_length = 20
+    sequence_length = 7
     tr, val, te = get_prepared_data_v3(dataset_hrs_path)
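For reference, a minimal standalone sketch of the upsampling logic above: each user's rows are duplicated floor(max/count) times in total, plus a random sample of the fractional remainder, so every user ends up with roughly as many rows as the best-represented user. The toy frame and the literal 'user' column stand in for user_str; illustrative only.

    import math
    import pandas as pd

    # toy frame: user 0 has 4 rows, user 1 has 2
    df = pd.DataFrame({'user': [0, 0, 0, 0, 1, 1],
                       'steps': [10, 20, 30, 40, 50, 60]})

    max_user_data = df['user'].value_counts().max()        # 4
    for user in df['user'].unique():
        user_data = df[df['user'] == user]
        times = max_user_data / user_data.shape[0]         # 1.0 and 2.0 here
        whole, frac = math.floor(times), times % 1
        for _ in range(1, whole):                          # whole copies in total
            df = pd.concat([df, user_data], ignore_index=True)
        df = pd.concat([df, user_data.sample(frac=frac)],  # fractional top-up
                       ignore_index=True)

    print(df['user'].value_counts())                       # 4 rows for each user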
@@ -391,28 +417,37 @@ def manual_tuning_v3(model_type):
     # config
     repeats = 3
     n_batch = 1024
-    n_epochs = 500
-    n_neurons = 16
-    l_rate = 1e-4
+    n_epochs = 200
+    n_neurons = 256
+    n_neurons2 = 512
+    n_neurons3 = 512
+    n_neurons4 = 128
+    l_rate = 1e-2
+    d1 = 256
+    reg1 = L1L2(l1=0.0, l2=0.001)
+    r1 = '0001'
+    reg2 = L1L2(l1=0.0, l2=0.1)
+    r2 = '01'
     history_list = list()
     # run diagnostic tests
     for i in range(repeats):
         history = train_one_model(tr, val, n_batch, n_epochs,
-                                  n_neurons, l_rate,
+                                  n_neurons, n_neurons2, n_neurons3, n_neurons4, l_rate, d1, r1, reg1, r2, reg2,
                                   sequence_length=sequence_length,
                                   model_type=model_type)
         history_list.append(history)
-    for metric in ['p', 'r', 'f1']:
+    for metric in ['acc', 'p', 'r', 'f1']:
         for history in history_list:
             plt.plot(history['train_'+metric], color='blue')
             plt.plot(history['test_'+metric], color='orange')
-        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
-                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
+        plt.savefig(figure_path+'v3/'+metric+get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2)
+                    +'.png')
         plt.clf()
     print('Done')

 def calculate_baselines():
     file_combinations = [(hour_timespan_str, with_threshold_str, 'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
                          (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
@@ -448,23 +483,51 @@ def get_prepared_data_v3(filename, sample=100):
     df = pd.read_json(filename)
     df = remove_covid_data(df)
-    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
+    # remove users with too little data
+    value_counts = df[user_str].value_counts()
+    df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
+    adjusted_df = pd.DataFrame()
+    # adjust labels
+    new_id = 0
+    for user_id in df[user_str].unique():
+        user_data = df[df[user_str]==user_id]
+        user_data[user_str] = new_id
+        adjusted_df = pd.concat([adjusted_df, user_data], ignore_index=True)
+        new_id += 1
+    # bin steps per hour TODO: adjust for minutes
+    for hour in ['Hour_'+str(i) for i in range(24)]:
+        hour_data = adjusted_df[hour]
+        # smaller than 1000 - round to nearest 10
+        a = ((hour_data[hour_data<1000]/10).round()*10)
+        # between 1000 and 10000 - round to nearest 100
+        b = ((hour_data[(hour_data>=1000) & (hour_data<10000)]/100).round()*100)
+        # higher or equal 10000 - one class
+        c = hour_data[hour_data >= 10000]
+        c = pd.Series(data={ind: 10000 for ind in c.index}, index=c.index)
+        new = pd.concat([a, b, c]).sort_index().astype(int)
+        adjusted_df[hour] = new
+    tr, val, te = split_data_by_userdata_percentage(adjusted_df, percentages=(70, 15, 15), sample=sample)
     tr = reduce_columns_v3(tr)
     val = reduce_columns_v3(val)
     te = reduce_columns_v3(te)

     scaler = MinMaxScaler()
     scaler.fit(tr.drop(columns=[user_str]))
     return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)

 def scale_dataset(scaler, df):
     y = df[user_str]
     x_scaled = scaler.transform(df.drop(columns=[user_str]))
     df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
     df_scaled.columns = df.columns
-    return prepare_user_data(df)
+    return prepare_user_data(df_scaled)
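A minimal sketch of the per-hour binning that get_prepared_data_v3 now applies, shown on one column: counts below 1000 are rounded to the nearest 10, counts from 1000 up to 10000 to the nearest 100, and everything at or above 10000 collapses into a single 10000 class. Values are illustrative.

    import pandas as pd

    hour_data = pd.Series([7, 123, 4567, 25000])

    a = (hour_data[hour_data < 1000] / 10).round() * 10                             # 10, 120
    b = (hour_data[(hour_data >= 1000) & (hour_data < 10000)] / 100).round() * 100  # 4600
    c = hour_data[hour_data >= 10000].map(lambda _: 10000)                          # 10000

    print(pd.concat([a, b, c]).sort_index().astype(int).tolist())  # [10, 120, 4600, 10000]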
 def calculate_baselines_v3():

@@ -498,5 +561,6 @@ if __name__ == "__main__":
     #visualise_results_v2()
     #manual_tuning(model_type=model_type_lstm)
     #calculate_baselines()
-    calculate_baselines_v3()
-    print('Done')
+    #calculate_baselines_v3()
+    manual_tuning_v3(model_type=model_type_lstm)
+    print('Done')  # TODO: differently sized amounts of data per user are a problem (also in the evaluation)

pipeline.py:

@@ -14,13 +14,14 @@ from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, GRU
 from tensorflow.keras.optimizers import Adam
 from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

 epochs = 5 #50
 model_type_gru = 'GRU'
 model_type_lstm = 'LSTM'
 model_type_bilstm = 'BiLSTM'

 # === Display functions ===
 def display_warning_about_2020_data():
     print("\n⚠️ Warning: 2020 data after February is excluded due to COVID-19.")
@@ -67,11 +68,11 @@ def prepare_user_data(df):
 def make_sequences(data, sequence_length):
     x, y = [], []
     features = data.drop('user', axis=1).values
-    features = features.astype(int)
+    #features = features.astype(int)
     labels = data['user'].values
-    for i in range(len(features) - sequence_length):
+    for i in range(len(features) - sequence_length+1):
         x.append(features[i:i + sequence_length])
-        y.append(labels[i + sequence_length])
+        y.append(labels[i + sequence_length-1])
     return x, y

 def prepare_data_for_model(user_data, sequence_length):
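In the revised make_sequences, a window is labelled with the user of its own last row (index i + sequence_length - 1) rather than the row after the window, and the loop yields len(features) - sequence_length + 1 windows, so a block of exactly sequence_length rows still produces one sequence. A toy illustration of the indexing:

    import numpy as np

    features = np.arange(10).reshape(5, 2)   # 5 time steps, 2 features
    labels = np.array([3, 3, 3, 3, 3])       # one user id per row
    sequence_length = 3

    x, y = [], []
    for i in range(len(features) - sequence_length + 1):   # 3 windows, not 2
        x.append(features[i:i + sequence_length])
        y.append(labels[i + sequence_length - 1])          # label of the window's last row

    print(len(x), y)   # 3 [3, 3, 3]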
@@ -209,7 +210,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     return tuner.get_best_models(num_models=1)[0]

-def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
+def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, l_rate, d1, r1, reg1, r2, reg2, sequence_length, model_type):
     x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
     n_features = x.shape[2]
     users = list(train_data.keys())
@@ -218,15 +219,19 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
     def build_model():
         model = Sequential()
         model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
-        # if model_type == model_type_bilstm:
-        #     model.add(Bidirectional(units=units_hp))
+        if model_type == model_type_bilstm:
+            model.add(Bidirectional(LSTM(n_neurons)))
         if model_type == model_type_lstm:
+            # model.add(LSTM(n_neurons, kernel_regularizer=reg1, return_sequences=True))
             model.add(LSTM(n_neurons))
-        # if model_type == model_type_gru:
-        #     model.add(GRU(units=units_hp))
+            # model.add(LSTM(n_neurons2))
+            # model.add(LSTM(n_neurons3, return_sequences=True))
+            # model.add(LSTM(n_neurons4))
+        if model_type == model_type_gru:
+            model.add(GRU(n_neurons))
         # TODO: add another dense layer
-        #model.add(Dense(256, activation='relu'))
-        # model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
+        #model.add(Dense(n_neurons, activation='relu'))
+        #model.add(Dropout(d1))
         model.add(Dense(len(users), activation='softmax'))
         model.compile(
             optimizer=Adam(learning_rate=l_rate),
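Stripped of the commented-out variants, build_model assembles one recurrent layer feeding a softmax classifier. A minimal sketch of the LSTM case; the loss and metrics here are assumptions, since this hunk only shows the optimizer:

    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Input, LSTM, Dense
    from tensorflow.keras.optimizers import Adam

    def build_lstm(sequence_length, n_features, n_classes, n_neurons, n_batch, l_rate):
        model = Sequential()
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
        model.add(LSTM(n_neurons))                         # single recurrent layer
        model.add(Dense(n_classes, activation='softmax'))  # one unit per user
        model.compile(optimizer=Adam(learning_rate=l_rate),
                      loss='sparse_categorical_crossentropy',  # assumed
                      metrics=['accuracy'])                    # assumed
        return model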
@@ -248,7 +253,8 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
         train_r.append(r)
         train_f1.append(f1)
         # evaluate model on test data
-        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch)
+        savename = 'cf_matrix_'+get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2)+'.json'
+        acc, p, r, f1 = evaluate(model, val_data, sequence_length, n_batch, save_name=savename)
         test_acc.append(acc)
         test_p.append(p)
         test_r.append(r)
@@ -262,13 +268,27 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
     return history

-def evaluate(model, df, sequence_length, batch_size):
+def get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2):
+    return '_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+str(n_batch)
+    #'x'+str(n_neurons3)+'x'+str(n_neurons4)
+    #+'_l'+str(l_rate)+'_r'+str(r1)+'xx'+str(r2)
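As committed, get_save_id encodes only the epoch count, first layer width, and batch size; the learning-rate and regularizer fragments are commented out, and the remaining arguments are accepted but unused. A worked example:

    get_save_id(200, 256, 512, 512, 128, 1024, 1e-2, 256, '0001', '01')
    # -> '_e200_n256_b1024'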
+def evaluate(model, df, sequence_length, batch_size, save_name=None):
     x, y = prepare_data_for_model(user_data=df, sequence_length=sequence_length)
     x = np.array(x)
     y_true = np.array(y)
     y_pred = model.predict(x, verbose=0, batch_size=batch_size)
     y_pred_classes = np.argmax(y_pred, axis=1)
+    cf_matrix = pd.DataFrame(confusion_matrix(y_true, y_pred_classes))
+    if save_name is not None:
+        cf_matrix.to_json('results/'+save_name)
+    true_counts = pd.DataFrame(y).value_counts()
+    print('Top true occurrences', true_counts[:6])
+    predicted_counts = pd.DataFrame(y_pred_classes).value_counts()
+    print('Top predicted occurrences', predicted_counts[:6])
     return eval_metrics(y_true=y_true, y_pred=y_pred_classes)
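Because the confusion matrix is written with DataFrame.to_json, it can be reloaded for inspection via pandas.read_json. A small sketch; the filename is illustrative, and evaluate assumes the results/ directory already exists (create_dir from main.py could ensure that):

    import pandas as pd

    cf = pd.read_json('results/cf_matrix__e200_n256_b1024.json')
    print(cf.sum(axis=1))   # row sums = number of true instances per class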
