diff --git a/.gitignore b/.gitignore
index 6fabdce..9ff7e30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,4 @@ working
 figures
 baseline_results.json
 baseline_results_v3.json
+results
diff --git a/main.py b/main.py
index 6d7dc4b..cfdd27a 100644
--- a/main.py
+++ b/main.py
@@ -408,8 +408,8 @@ def upsampling(df):
 
 
 def manual_tuning_v3(model_type):
-    # TODO: hrs/min + different sequence lengths
-    sequence_length = 7
+    # TODO: hrs/min
+    sequence_length = 1
 
     tr, val, te = get_prepared_data_v3(dataset_hrs_path)
 
@@ -417,7 +417,7 @@ def manual_tuning_v3(model_type):
     # config
     repeats = 3
     n_batch = 1024
-    n_epochs = 200
+    n_epochs = 10
     n_neurons = 256
     n_neurons2 = 512
     n_neurons3 = 512
@@ -483,9 +483,9 @@ def get_prepared_data_v3(filename, sample=100):
 
     df = pd.read_json(filename)
     df = remove_covid_data(df)
-    # remove users with too little data
+    # remove users with too little data (optional)
    value_counts = df[user_str].value_counts()
-    df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
+    # df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
 
     adjusted_df = pd.DataFrame()
     # adjust labels
@@ -532,12 +532,12 @@ def scale_dataset(scaler, df):
 
 def calculate_baselines_v3():
     file_combinations = [(hour_timespan_str, dataset_hrs_path),
-                         (min_timespan_str, dataset_min_path),
+                         # (min_timespan_str, dataset_min_path),  # TODO: dataset binning not ready for minutes
                          ]
     baseline_res = pd.DataFrame()
     for timespan_id, filename in file_combinations:
         _, _, te = get_prepared_data_v3(filename)
-        for sequence_length in range(5,30, 5):
+        for sequence_length in range(1,30,5):
             x, y = prepare_data_for_model(user_data=te,
                                           sequence_length=sequence_length)
             for strategy in ['most_frequent', 'stratified', 'uniform']:
@@ -554,13 +554,19 @@ def calculate_baselines_v3():
     print('Done')
 
 if __name__ == "__main__":
+    # create the directories that are needed
+    create_dir('results/')
+    create_dir(figure_path)
+
     # main_two_v1()
     # visualise_results_v1()
     #test(model_type=model_type_gru)
-    # main_two_v2(model_type=model_type_gru)
+    # main_two_v2(model_type=model_type_gru)
     #visualise_results_v2()
     #manual_tuning(model_type=model_type_lstm)
     #calculate_baselines()
+
+    #### current from here on (21.01.2026)
     #calculate_baselines_v3()
     manual_tuning_v3(model_type=model_type_lstm)
-    print('Done') # TODO: unterschiedlich große Datenmengen als ein Problem (auch in der Evaluation)
+    print('Done')
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
index 5fc140a..728020d 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -68,33 +68,34 @@ def prepare_user_data(df):
 def make_sequences(data, sequence_length):
     x, y = [], []
     features = data.drop('user', axis=1).values
-    #features = features.astype(int)
     labels = data['user'].values
-    for i in range(len(features) - sequence_length+1):
+#    for i in range(len(features) - sequence_length+1):  # with overlap on days
+    for i in range(0, len(features) - sequence_length + 1, sequence_length):  # without overlap on days
         x.append(features[i:i + sequence_length])
         y.append(labels[i + sequence_length-1])
     return x, y
 
-def prepare_data_for_model(user_data, sequence_length):
+def prepare_data_for_model(user_data, sequence_length, print_counts=False):
     x, y = [], []
     combined = pd.DataFrame()
     for user, data in user_data.items():
         x_new, y_new = make_sequences(data, sequence_length)
         x = x + x_new
         y = y + y_new
-        if len(x_new)>0:
+        if print_counts and len(x_new)>0:
             var = [[pd.DataFrame(a[s])for s in range(sequence_length)] for a in x_new ]
             df_var = pd.concat([pd.concat(seq_list).T for seq_list in var])
             df_var['user'] = user
             combined = pd.concat([combined, df_var], ignore_index=True)
-    combined_ohne = combined.drop('user', axis=1)
-    print('Alle', len(combined))
-    print('Unique mit user', len(combined.drop_duplicates()))
-    print('Unique ohne user', len(combined_ohne.drop_duplicates()))
-    print('Unique')
-    print(combined.drop_duplicates()['user'].value_counts())
-    print('Alle')
-    print(combined['user'].value_counts())
+    if print_counts:
+        combined_ohne = combined.drop('user', axis=1)
+        print('Alle', len(combined))
+        print('Unique mit user', len(combined.drop_duplicates()))
+        print('Unique ohne user', len(combined_ohne.drop_duplicates()))
+        print('Unique')
+        print(combined.drop_duplicates()['user'].value_counts())
+        print('Alle')
+        print(combined['user'].value_counts())
     random.Random(17).shuffle(x)
     random.Random(17).shuffle(y)
     x = np.array(x)
@@ -239,11 +240,8 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons,n_neurons
        # model.add(LSTM(n_neurons, kernel_regularizer=reg1, return_sequences=True))
         model.add(LSTM(n_neurons))
        # model.add(LSTM(n_neurons2))
-       # model.add(LSTM(n_neurons3, return_sequences=True))
-       # model.add(LSTM(n_neurons4))
     if model_type == model_type_gru:
         model.add(GRU(n_neurons))
-    # TODO: add another dense layer
     #model.add(Dense(n_neurons, activation='relu'))
     #model.add(Dropout(d1))
     model.add(Dense(len(users), activation='softmax'))
diff --git a/preprocessing_new.py b/preprocessing_new.py
index e7b52a0..07cd7dd 100644
--- a/preprocessing_new.py
+++ b/preprocessing_new.py
@@ -122,7 +122,7 @@ def process_file_15_min(file_path, user_label):
 
 if __name__ == "__main__":
     pd.options.mode.copy_on_write = True
-    # Generate file paths, skipping specified files
+    # Generate file paths
     files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')] +
              ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
 
@@ -136,8 +136,7 @@ if __name__ == "__main__":
 
     combined_df = pd.concat(processed_dfs, ignore_index=True)
 
-    # Save the combined DataFrame to a new Excel file
+    # Save the combined DataFrame to a new json file
     combined_df.to_json(save_name, index=False)
-    user_counts = combined_df[user_str].value_counts()
 
     print('Done')
\ No newline at end of file
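
For illustration, a minimal standalone sketch (not part of the patch) of the windowing change in pipeline.py's make_sequences: the loop now strides by sequence_length instead of 1, so no row is shared between two sequences. The array contents and the sequence length below are made up for the example.

import numpy as np

features = np.arange(10)   # stand-in for 10 consecutive rows of one user
sequence_length = 3

# previous behaviour: stride 1, consecutive windows share rows
overlap = [features[i:i + sequence_length]
           for i in range(len(features) - sequence_length + 1)]

# new behaviour: stride == sequence_length, each row lands in at most one window
no_overlap = [features[i:i + sequence_length]
              for i in range(0, len(features) - sequence_length + 1, sequence_length)]

print(len(overlap))      # 8 windows: [0 1 2], [1 2 3], ..., [7 8 9]
print(len(no_overlap))   # 3 windows: [0 1 2], [3 4 5], [6 7 8]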