
Minimal code cleanup

master
Bianca Steffes 2 weeks ago
parent commit 1d14bc0c8d
  1. .gitignore (1 changed line)
  2. main.py (24 changed lines)
  3. pipeline.py (28 changed lines)
  4. preprocessing_new.py (5 changed lines)

.gitignore (1 changed line)

@@ -144,3 +144,4 @@ working
figures
baseline_results.json
baseline_results_v3.json
results

main.py (24 changed lines)

@@ -408,8 +408,8 @@ def upsampling(df):
def manual_tuning_v3(model_type):
# TODO: hrs/min + different sequence lengths
sequence_length = 7
# TODO: hrs/min
sequence_length = 1
tr, val, te = get_prepared_data_v3(dataset_hrs_path)
@@ -417,7 +417,7 @@ def manual_tuning_v3(model_type):
# config
repeats = 3
n_batch = 1024
n_epochs = 200
n_epochs = 10
n_neurons = 256
n_neurons2 = 512
n_neurons3 = 512
@@ -483,9 +483,9 @@ def get_prepared_data_v3(filename, sample=100):
df = pd.read_json(filename)
df = remove_covid_data(df)
# remove users with too little data
# remove users with too little data (optional)
value_counts = df[user_str].value_counts()
df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
# df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
adjusted_df = pd.DataFrame()
# adjust labels
@@ -532,12 +532,12 @@ def scale_dataset(scaler, df):
def calculate_baselines_v3():
file_combinations = [(hour_timespan_str, dataset_hrs_path),
(min_timespan_str, dataset_min_path),
# (min_timespan_str, dataset_min_path), # TODO: dataset binning not ready for minutes
]
baseline_res = pd.DataFrame()
for timespan_id, filename in file_combinations:
_, _, te = get_prepared_data_v3(filename)
for sequence_length in range(5,30, 5):
for sequence_length in range(1,30,5):
x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
for strategy in ['most_frequent', 'stratified', 'uniform']:
@@ -554,13 +554,19 @@ def calculate_baselines_v3():
print('Done')
if __name__ == "__main__":
# create the directories that are needed
create_dir('results/')
create_dir(figure_path)
# main_two_v1()
# visualise_results_v1()
#test(model_type=model_type_gru)
# main_two_v2(model_type=model_type_gru)
# main_two_v2(model_type=model_type_gru)
#visualise_results_v2()
#manual_tuning(model_type=model_type_lstm)
#calculate_baselines()
#### Current from here on (21.01.2026)
#calculate_baselines_v3()
manual_tuning_v3(model_type=model_type_lstm)
print('Done') # TODO: differently sized amounts of data per user are a problem (also in the evaluation)
print('Done')
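
Note on calculate_baselines_v3: the strategy names it loops over ('most_frequent', 'stratified', 'uniform') match scikit-learn's DummyClassifier strategies. The sketch below shows how such baselines could be scored; the use of DummyClassifier and the helper name baseline_scores are assumptions for illustration, not taken from this repository.

```python
# Minimal sketch, assuming scikit-learn's DummyClassifier behind the three
# strategy names; x is (n_sequences, sequence_length, n_features), y holds users.
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def baseline_scores(x, y, strategies=('most_frequent', 'stratified', 'uniform')):
    flat_x = np.asarray(x).reshape(len(x), -1)   # dummy models ignore the features anyway
    scores = {}
    for strategy in strategies:
        clf = DummyClassifier(strategy=strategy, random_state=17)
        clf.fit(flat_x, y)
        scores[strategy] = accuracy_score(y, clf.predict(flat_x))
    return scores

# toy example: 20 sequences of length 1 with 4 features, two users
x_toy = np.random.rand(20, 1, 4)
y_toy = np.array(['user_a'] * 12 + ['user_b'] * 8)
print(baseline_scores(x_toy, y_toy))   # most_frequent lands at 0.6 here
```

Such dummy baselines give a chance-level floor against which the LSTM/GRU accuracies can be judged.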

pipeline.py (28 changed lines)

@@ -68,33 +68,34 @@ def prepare_user_data(df):
def make_sequences(data, sequence_length):
x, y = [], []
features = data.drop('user', axis=1).values
#features = features.astype(int)
labels = data['user'].values
for i in range(len(features) - sequence_length+1):
# for i in range(len(features) - sequence_length+1): # with overlap on days
for i in range(0, len(features) - sequence_length + 1, sequence_length): # without overlap on days
x.append(features[i:i + sequence_length])
y.append(labels[i + sequence_length-1])
return x, y
def prepare_data_for_model(user_data, sequence_length):
def prepare_data_for_model(user_data, sequence_length, print_counts=False):
x, y = [], []
combined = pd.DataFrame()
for user, data in user_data.items():
x_new, y_new = make_sequences(data, sequence_length)
x = x + x_new
y = y + y_new
if len(x_new)>0:
if print_counts and len(x_new)>0:
var = [[pd.DataFrame(a[s])for s in range(sequence_length)] for a in x_new ]
df_var = pd.concat([pd.concat(seq_list).T for seq_list in var])
df_var['user'] = user
combined = pd.concat([combined, df_var], ignore_index=True)
combined_ohne = combined.drop('user', axis=1)
print('Alle', len(combined))
print('Unique mit user', len(combined.drop_duplicates()))
print('Unique ohne user', len(combined_ohne.drop_duplicates()))
print('Unique')
print(combined.drop_duplicates()['user'].value_counts())
print('Alle')
print(combined['user'].value_counts())
if print_counts:
combined_ohne = combined.drop('user', axis=1)
print('Alle', len(combined))
print('Unique mit user', len(combined.drop_duplicates()))
print('Unique ohne user', len(combined_ohne.drop_duplicates()))
print('Unique')
print(combined.drop_duplicates()['user'].value_counts())
print('Alle')
print(combined['user'].value_counts())
random.Random(17).shuffle(x)
random.Random(17).shuffle(y)
x = np.array(x)
@@ -239,11 +240,8 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons,n_neurons
# model.add(LSTM(n_neurons, kernel_regularizer=reg1, return_sequences=True))
model.add(LSTM(n_neurons))
# model.add(LSTM(n_neurons2))
# model.add(LSTM(n_neurons3, return_sequences=True))
# model.add(LSTM(n_neurons4))
if model_type == model_type_gru:
model.add(GRU(n_neurons))
# TODO: add another dense layer
#model.add(Dense(n_neurons, activation='relu'))
#model.add(Dropout(d1))
model.add(Dense(len(users), activation='softmax'))
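
The central change in make_sequences is the window step: iterating in steps of sequence_length instead of 1 yields sequences that no longer share days. A standalone toy illustration of the two behaviours follows; windows() is a hypothetical helper, not a function from pipeline.py.

```python
# Toy sketch of the windowing change: overlap=True mirrors the old loop
# (step 1), overlap=False mirrors the new loop (step = sequence_length).
def windows(n_rows, sequence_length, overlap):
    step = 1 if overlap else sequence_length
    return [(i, i + sequence_length)
            for i in range(0, n_rows - sequence_length + 1, step)]

print(windows(10, 3, overlap=True))   # [(0, 3), (1, 4), ..., (7, 10)] -> days shared
print(windows(10, 3, overlap=False))  # [(0, 3), (3, 6), (6, 9)] -> each day used once
```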

preprocessing_new.py (5 changed lines)

@@ -122,7 +122,7 @@ def process_file_15_min(file_path, user_label):
if __name__ == "__main__":
pd.options.mode.copy_on_write = True
# Generate file paths, skipping specified files
# Generate file paths
files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
+ ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
@@ -136,8 +136,7 @@ if __name__ == "__main__":
combined_df = pd.concat(processed_dfs, ignore_index=True)
# Save the combined DataFrame to a new Excel file
# Save the combined DataFrame to a new json file
combined_df.to_json(save_name, index=False)
user_counts = combined_df[user_str].value_counts()
print('Done')
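
The corrected comment reflects that the combined DataFrame is written as JSON, which main.py's get_prepared_data_v3 reads back with pd.read_json. A minimal round-trip sketch with a placeholder file name and toy frame:

```python
# Placeholder path and toy data; illustrates the write/read pair implied by
# to_json here and pd.read_json in get_prepared_data_v3 (orient chosen for the sketch).
import pandas as pd

df = pd.DataFrame({'user': ['a', 'a', 'b'], 'hour': [8, 9, 8]})
save_name = 'combined_toy.json'                  # placeholder path
df.to_json(save_name, orient='records')          # one JSON object per row, no index
restored = pd.read_json(save_name, orient='records')
print(restored.equals(df))                       # True
```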