@@ -1,6 +1,7 @@
import json
import os

import math
import numpy as np
import pandas as pd
import sklearn
@@ -18,7 +19,7 @@ from pipeline import (
    train_models,
    evaluate_models,
    prepare_data_for_model, model_type_gru, model_type_lstm, model_type_bilstm, train_models_v2, train_one_model,
    eval_metrics
    eval_metrics, get_save_id
)

year_str = 'Year'
@@ -45,6 +46,7 @@ model_type_str = 'model type'
week_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
figure_path = 'figures/'
predicitons_path = 'preds/'

# === Configurable Parameters ===
dataset_path = './Datasets/'
@@ -69,8 +71,17 @@ predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
}

def create_dir(path):
    """
    Creates a directory if it doesn't exist yet.

    :param path: The path to the directory
    """
    if not os.path.exists(path):
        os.makedirs(path)

def remove_covid_data(df):
    df = df[~((df[year_str]==2020) & (df[month_str]>2))]
    df = df[~(df[year_str]>=2020)]
    return df
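
# Note: as written, the second filter (dropping everything from 2020 onward) subsumes the
# first (March 2020 and later), presumably to keep step-count patterns distorted during the
# pandemic out of training and evaluation.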

def split_data_by_month_percentage(df, percentages):
@@ -381,9 +392,24 @@ def manual_tuning(model_type):
    print('Done')


def upsampling(df):
    max_user_data = df[user_str].value_counts().max()
    for user in df[user_str].unique():
        user_data = df[df[user_str]==user]
        user_count = user_data.shape[0]
        times = max_user_data / user_count
        before_comma = math.floor(times)
        after_comma = times % 1
        after_comma_data = user_data.sample(frac=after_comma)
        for i in range(1, before_comma):
            df = pd.concat([df, user_data], ignore_index=True)
        df = pd.concat([df, after_comma_data], ignore_index=True)
    return df
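
# Worked example of the oversampling arithmetic above (illustrative values): if the largest
# user has 1000 rows and the current user has 400, times = 2.5, before_comma = 2 and
# after_comma = 0.5, so the loop appends one extra full copy and sample(frac=0.5) adds
# roughly 200 randomly drawn rows, bringing that user close to 1000 rows in total.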


def manual_tuning_v3(model_type):
    # TODO: hrs/min + different sequence lengths
    sequence_length = 20
    sequence_length = 7

    tr, val, te = get_prepared_data_v3(dataset_hrs_path)

@@ -391,28 +417,37 @@ def manual_tuning_v3(model_type):
    # config
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 16
    l_rate = 1e-4
    n_epochs = 200
    n_neurons = 256
    n_neurons2 = 512
    n_neurons3 = 512
    n_neurons4 = 128
    l_rate = 1e-2
    d1 = 256
    reg1 = L1L2(l1=0.0, l2=0.001)
    r1 = '0001'
    reg2 = L1L2(l1=0.0, l2=0.1)
    r2 = '01'
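    # r1 and r2 appear to be short string tags for the two L1L2 regularisers
    # (l2=0.001 and l2=0.1); presumably get_save_id folds them into the output file name.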

    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(tr, val, n_batch, n_epochs,
                                  n_neurons, l_rate,
                                  n_neurons, n_neurons2, n_neurons3, n_neurons4, l_rate, d1, r1, reg1, r2, reg2,
                                  sequence_length=sequence_length,
                                  model_type=model_type)
        history_list.append(history)
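
    # Diagnostic plots below: one curve per repeat, with the training metric in blue and the
    # validation/test metric in orange, written to figures/v3/.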
    for metric in ['p', 'r', 'f1']:
    for metric in ['acc', 'p', 'r', 'f1']:
        for history in history_list:
            plt.plot(history['train_'+metric], color='blue')
            plt.plot(history['test_'+metric], color='orange')
        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
        plt.savefig(figure_path+'v3/'+metric+get_save_id(n_epochs, n_neurons, n_neurons2, n_neurons3, n_neurons4, n_batch, l_rate, d1, r1, r2)
                    +'.png')
        plt.clf()
    print('Done')



def calculate_baselines():
    file_combinations = [(hour_timespan_str, with_threshold_str, 'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
                         (min_timespan_str, with_threshold_str, 'ALL32USERS15MIN_WITHTHRESHOLD.xlsx'),
@@ -448,23 +483,51 @@ def get_prepared_data_v3(filename, sample=100):
    df = pd.read_json(filename)
    df = remove_covid_data(df)

    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
    # remove users with too little data
    value_counts = df[user_str].value_counts()
    df = df[df[user_str].isin(value_counts[value_counts>1000].index)]

    adjusted_df = pd.DataFrame()
    # adjust labels
    new_id = 0
    for user_id in df[user_str].unique():
        user_data = df[df[user_str]==user_id]
        user_data[user_str] = new_id
        adjusted_df = pd.concat([adjusted_df, user_data], ignore_index=True)
        new_id += 1
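
    # The original user ids are remapped to consecutive integers 0..n_users-1 here, presumably
    # so they can be used directly as class labels by the models.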

    # bin steps per hour TODO: adjust for minutes
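    # Illustrative effect of the binning below: 237 -> 240, 4321 -> 4300, and anything from
    # 10000 upwards collapses into a single 10000 bucket, coarsening the hourly step counts
    # into a smaller set of discrete values.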
    for hour in ['Hour_'+str(i) for i in range(24)]:
        hour_data = adjusted_df[hour]
        # smaller than 1000 - round to nearest 10
        a = ((hour_data[hour_data<1000]/10).round()*10)
        # between 1000 and 10000 - round to nearest 100
        b = ((hour_data[(hour_data>=1000) & (hour_data<10000)]/100).round()*100)
        # 10000 or higher - one class
        c = hour_data[hour_data >= 10000]
        c = pd.Series(data={ind: 10000 for ind in c.index}, index=c.index)
        new = pd.concat([a, b, c]).sort_index().astype(int)
        adjusted_df[hour] = new

    tr, val, te = split_data_by_userdata_percentage(adjusted_df, percentages=(70, 15, 15), sample=sample)
    tr = reduce_columns_v3(tr)
    val = reduce_columns_v3(val)
    te = reduce_columns_v3(te)


    scaler = MinMaxScaler()
    scaler.fit(tr.drop(columns=[user_str]))
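    # The scaler is fitted on the training split only, so validation and test data are
    # transformed with statistics that never saw those splits.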

    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)


def scale_dataset(scaler, df):
    y = df[user_str]
    x_scaled = scaler.transform(df.drop(columns=[user_str]))

    df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
    df_scaled.columns = df.columns
    return prepare_user_data(df)
    return prepare_user_data(df_scaled)


def calculate_baselines_v3():
@@ -498,5 +561,6 @@ if __name__ == "__main__":
    #visualise_results_v2()
    #manual_tuning(model_type=model_type_lstm)
    #calculate_baselines()
    calculate_baselines_v3()
    print('Done')
    #calculate_baselines_v3()
    manual_tuning_v3(model_type=model_type_lstm)
    print('Done')  # TODO: differently sized amounts of data per user are a problem (also in the evaluation)