
Added more tests

master
Bianca Steffes, 1 week ago
commit f20b852161
.gitignore             |    1
figures/v1_results.svg | 1877
main.py                |   44
pipeline.py            |   34

.gitignore

@@ -141,3 +141,4 @@ cython_debug/
 .idea
 working/tuner
 working
+figures

figures/v1_results.svg
(File diff suppressed because it is too large)

main.py

@@ -68,11 +68,11 @@ def split_data_by_month_percentage(df, percentages):
     tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
     return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')

-def split_data_by_userdata_percentage(df, percentages):
+def split_data_by_userdata_percentage(df, percentages, sample):
     train_p, valid_p, test_p = percentages
     tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
     for user_id in df[user_str].unique():
-        user_data = df[df[user_str]==user_id].sort_values([year_str, month_str])
+        user_data = df[df[user_str]==user_id].sample(frac=sample/100).sort_values([year_str, month_str])
         u_tr, u_va, u_te = np.split(user_data, [int((train_p/100)*len(user_data)), int(((train_p+valid_p)/100)*len(user_data))])
         tr = pd.concat([tr, u_tr], ignore_index=True)
         va = pd.concat([va, u_va], ignore_index=True)
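
Note: the new sample parameter thins each user's history before the chronological split, which is what lets the test() harness below train on 33/66/100% of the data. A minimal sketch of the behaviour, assuming the module-level constants user_str/year_str/month_str resolve to 'user'/'year'/'month' as elsewhere in the repo:

import numpy as np
import pandas as pd

user_str, year_str, month_str = 'user', 'year', 'month'  # assumed column constants
df = pd.DataFrame({user_str: ['a'] * 10 + ['b'] * 10,
                   year_str: [2021] * 20,
                   month_str: list(range(1, 11)) * 2})

sample = 50  # keep 50% of each user's rows, as sample=percentage does in test()
for user_id in df[user_str].unique():
    user_data = (df[df[user_str] == user_id]
                 .sample(frac=sample / 100)             # random thinning per user
                 .sort_values([year_str, month_str]))   # restore chronological order
    # 80/10/10 chronological split of the remaining rows
    u_tr, u_va, u_te = np.split(user_data, [int(0.8 * len(user_data)),
                                            int(0.9 * len(user_data))])
    print(user_id, len(u_tr), len(u_va), len(u_te))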
@@ -285,9 +285,47 @@ def visualise_results_v2():
     # Conclusion: no clearly better versions discernible


+def test(model_type):
+    sequence_length = 20
+    data_filename = os.listdir(dataset_path)[0]
+    timespan_id = hour_timespan_str
+    threshold_id = with_threshold_str
+    file_path = os.path.join(dataset_path, data_filename)
+    df = load_dataset(file_path)
+    df = remove_covid_data(df)
+    results = pd.DataFrame()
+    for percentage in [33, 66, 100]:
+        print('Percentage:', percentage)
+        tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=percentage)
+        tr = reduce_columns(tr, data_filename)
+        val = reduce_columns(val, data_filename)
+        te = reduce_columns(te, data_filename)
+        user_data_train = prepare_user_data(tr)
+        user_data_val = prepare_user_data(val)
+        best_model = train_models_v2(user_data_train, user_data_val,
+                                     sequence_length=sequence_length,
+                                     model_type=model_type)
+        results = pd.concat([results,
+                             evaluate_model_on_test_data(model=best_model,
+                                                         test_df=te,
+                                                         sequence_length=sequence_length,
+                                                         time_span_id=timespan_id,
+                                                         threshold_id=threshold_id,
+                                                         model_type=model_type,
+                                                         split_id=data_split_str)],
+                            ignore_index=True)
+    print(results)
+
+
 if __name__ == "__main__":
     # main_two_v1()
     # visualise_results_v1()
-    main_two_v2(model_type=model_type_gru)
+    test(model_type=model_type_gru)
+    # main_two_v2(model_type=model_type_gru)
     #visualise_results_v2()
     print('Done')
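
Note: test() is a training-set-size ablation: one tuner run and one evaluation per sampling percentage, with all metrics accumulated in a single DataFrame. A hypothetical driver (not part of this commit) that repeats the sweep for every architecture defined in pipeline.py:

from main import test
from pipeline import model_type_gru, model_type_lstm, model_type_bilstm

for mt in (model_type_gru, model_type_lstm, model_type_bilstm):
    test(model_type=mt)  # 33/66/100% training-size sweep per architecture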

pipeline.py

@@ -4,7 +4,7 @@ import pandas as pd
 import shutil
 from keras import Input
-from keras.src.metrics import F1Score
+from keras.src.metrics import F1Score, Precision, Recall, Accuracy
 from pandas import ExcelWriter
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional,GRU
@@ -13,7 +13,7 @@ from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
 from keras_tuner import RandomSearch
 from sklearn.metrics import accuracy_score

-epochs = 50
+epochs = 5  # 50
 model_type_gru = 'GRU'
 model_type_lstm = 'LSTM'
 model_type_bilstm = 'BiLSTM'
@@ -57,14 +57,15 @@ def filter_test_data(df, scenario):
     return pd.concat(data_parts, ignore_index=True)

 def prepare_user_data(df):
-    df_sorted = df.sort_values(by='user').reset_index(drop=True)
-    users = df_sorted['user'].unique()
-    return {user: df_sorted[df_sorted['user'] == user] for user in users}
+    # df_sorted = df.sort_values(by='user').reset_index(drop=True)
+    users = df['user'].unique()
+    return {user: df[df['user'] == user] for user in users}

 def prepare_data_for_model(user_data, sequence_length):
     X, y = [], []
     for user, data in user_data.items():
         features = data.drop('user', axis=1).values
+        features = features.astype(int)
         labels = data['user'].values
         for i in range(len(features) - sequence_length):
             X.append(features[i:i + sequence_length])
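
Note: prepare_data_for_model builds overlapping sliding windows per user; the added astype(int) cast only changes the feature dtype, not the windowing. A toy reconstruction of the window loop (the label append lies below the visible hunk, so y.append here is an assumption based on labels):

import numpy as np
import pandas as pd

# toy frame: two users, two integer feature columns
df = pd.DataFrame({'user': ['a'] * 5 + ['b'] * 5,
                   'f1': range(10), 'f2': range(10, 20)})
user_data = {u: df[df['user'] == u] for u in df['user'].unique()}

sequence_length = 3
X, y = [], []
for user, data in user_data.items():
    features = data.drop('user', axis=1).values
    features = features.astype(int)  # cast added in this commit
    labels = data['user'].values
    for i in range(len(features) - sequence_length):
        X.append(features[i:i + sequence_length])
        y.append(labels[i + sequence_length])  # assumed: each window labelled with its user
print(np.array(X).shape)  # (4, 3, 2): two windows of length 3 per user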
@@ -144,9 +145,11 @@ def train_models(user_data, user_data_val, sequence_lengths, tuner_dir="./workin

 # === Training & Validation ===
 def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     tuner_dir = "./working/tuner/"+model_type
+    # val_metric = 'val_f1'
+    val_metric = 'val_precision'

-    early_stopping = EarlyStopping(monitor='val_f1', patience=3, restore_best_weights=True)
-    lr_scheduler = ReduceLROnPlateau(monitor='val_f1', factor=0.5, patience=2)
+    early_stopping = EarlyStopping(monitor=val_metric, patience=3, restore_best_weights=True)
+    lr_scheduler = ReduceLROnPlateau(monitor=val_metric, factor=0.5, patience=2)

     shutil.rmtree(tuner_dir, ignore_errors=True)
@@ -156,11 +159,12 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     n_features = x.shape[2]
     users = list(user_data.keys())

-    y_val = np.array(y_val).reshape(-1, 1)
-    y = np.array(y).reshape(-1, 1)
+    # y_val = np.array(y_val).reshape(-1, 1)
+    # y = np.array(y).reshape(-1, 1)

     def build_model(hp):
-        units_hp = hp.Int('units', 2, 256, step=2, sampling="log")
+        units_hp = hp.Int('units', 2, 8, step=2, sampling="log")
+        # units_hp = hp.Int('units', 2, 256, step=2, sampling="log")
         model = Sequential()
         model.add(Input((sequence_length, n_features)))
@@ -170,18 +174,20 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
             model.add(LSTM(units=units_hp))
         if model_type==model_type_gru:
             model.add(GRU(units=units_hp))
-        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
+        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.2, step=0.1)))
         model.add(Dense(len(users), activation='softmax'))
         model.compile(
-            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
+            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-5])),
             loss='sparse_categorical_crossentropy',
-            metrics=[F1Score(name='f1', average='weighted')]
+            metrics=[  # F1Score(name='f1', average='weighted'),
+                Precision(),  # Recall(), Accuracy()
+            ]
         )
         return model

     tuner = RandomSearch(
         build_model,
-        objective=keras_tuner.Objective("val_f1", direction="max"),
+        objective=keras_tuner.Objective(val_metric, direction="max"),
         max_trials=120,
         directory=tuner_dir,
     )
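
Note: a caveat on swapping the objective from val_f1 to val_precision: tf.keras.metrics.Precision is a binary metric. It thresholds each predicted probability at 0.5 and expects one-hot-shaped targets, so combined with sparse integer labels and a softmax output it does not compute multiclass precision. A small demonstration of the semantics:

import numpy as np
import tensorflow as tf

y_true = np.array([0, 1, 2])                 # sparse integer labels, 3 classes
y_prob = np.array([[0.6, 0.3, 0.1],
                   [0.2, 0.7, 0.1],
                   [0.3, 0.3, 0.4]])         # softmax outputs

m = tf.keras.metrics.Precision()
m.update_state(tf.one_hot(y_true, depth=3), y_prob)  # needs one-hot targets
print(float(m.result()))  # 1.0: sample 3 is ignored, no probability passes 0.5

If per-class multiclass precision is what the tuner should maximise, one-hot targets with categorical_crossentropy (or a custom metric) would be needed.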
