
Minimal code cleanup

master
Bianca Steffes 2 weeks ago
parent commit 1d14bc0c8d
  1. .gitignore (1 changed line)
  2. main.py (24 changed lines)
  3. pipeline.py (28 changed lines)
  4. preprocessing_new.py (5 changed lines)

.gitignore (1 changed line)

@@ -144,3 +144,4 @@ working
figures
baseline_results.json
baseline_results_v3.json
results

main.py (24 changed lines)

@@ -408,8 +408,8 @@ def upsampling(df):
def manual_tuning_v3(model_type):
# TODO: hrs/min + different sequence lengths
sequence_length = 7
# TODO: hrs/min
sequence_length = 1
tr, val, te = get_prepared_data_v3(dataset_hrs_path)
@@ -417,7 +417,7 @@ def manual_tuning_v3(model_type):
# config
repeats = 3
n_batch = 1024
n_epochs = 200
n_epochs = 10
n_neurons = 256
n_neurons2 = 512
n_neurons3 = 512
@@ -483,9 +483,9 @@ def get_prepared_data_v3(filename, sample=100):
df = pd.read_json(filename)
df = remove_covid_data(df)
# remove users with too little data
# remove users with too little data (optional)
value_counts = df[user_str].value_counts()
df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
# df = df[df[user_str].isin(value_counts[value_counts>1000].index)]
adjusted_df = pd.DataFrame()
# adjust labels
@@ -532,12 +532,12 @@ def scale_dataset(scaler, df):
def calculate_baselines_v3():
file_combinations = [(hour_timespan_str, dataset_hrs_path),
(min_timespan_str, dataset_min_path),
# (min_timespan_str, dataset_min_path), # TODO: dataset binning not ready for minutes
]
baseline_res = pd.DataFrame()
for timespan_id, filename in file_combinations:
_, _, te = get_prepared_data_v3(filename)
for sequence_length in range(5,30, 5):
for sequence_length in range(1,30,5):
x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
for strategy in ['most_frequent', 'stratified', 'uniform']:
@@ -554,13 +554,19 @@ def calculate_baselines_v3():
print('Done')
if __name__ == "__main__":
# create the directories that are needed
create_dir('results/')
create_dir(figure_path)
# main_two_v1()
# visualise_results_v1()
#test(model_type=model_type_gru)
# main_two_v2(model_type=model_type_gru)
# main_two_v2(model_type=model_type_gru)
#visualise_results_v2()
#manual_tuning(model_type=model_type_lstm)
#calculate_baselines()
#### Current from here on (21.01.2026)
#calculate_baselines_v3()
manual_tuning_v3(model_type=model_type_lstm)
print('Done') # TODO: differently sized amounts of data per user are a problem (also in the evaluation)
print('Done')
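
Note on calculate_baselines_v3: the strategy names it loops over ('most_frequent', 'stratified', 'uniform') match scikit-learn's DummyClassifier strategies. The sketch below shows how such baselines could be scored; the use of DummyClassifier and the helper name baseline_scores are assumptions for illustration, not taken from this repository.

```python
# Minimal sketch, assuming scikit-learn's DummyClassifier behind the three
# strategy names; x is (n_sequences, sequence_length, n_features), y holds users.
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def baseline_scores(x, y, strategies=('most_frequent', 'stratified', 'uniform')):
    flat_x = np.asarray(x).reshape(len(x), -1)   # dummy models ignore the features anyway
    scores = {}
    for strategy in strategies:
        clf = DummyClassifier(strategy=strategy, random_state=17)
        clf.fit(flat_x, y)
        scores[strategy] = accuracy_score(y, clf.predict(flat_x))
    return scores

# toy example: 20 sequences of length 1 with 4 features, two users
x_toy = np.random.rand(20, 1, 4)
y_toy = np.array(['user_a'] * 12 + ['user_b'] * 8)
print(baseline_scores(x_toy, y_toy))   # most_frequent lands at 0.6 here
```

Such dummy baselines give a chance-level floor against which the LSTM/GRU accuracies can be judged.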

pipeline.py (28 changed lines)

@@ -68,33 +68,34 @@ def prepare_user_data(df):
def make_sequences(data, sequence_length):
x, y = [], []
features = data.drop('user', axis=1).values
#features = features.astype(int)
labels = data['user'].values
for i in range(len(features) - sequence_length+1):
# for i in range(len(features) - sequence_length+1): # with overlap on days
for i in range(0, len(features) - sequence_length + 1, sequence_length): # without overlap on days
x.append(features[i:i + sequence_length])
y.append(labels[i + sequence_length-1])
return x, y
def prepare_data_for_model(user_data, sequence_length):
def prepare_data_for_model(user_data, sequence_length, print_counts=False):
x, y = [], []
combined = pd.DataFrame()
for user, data in user_data.items():
x_new, y_new = make_sequences(data, sequence_length)
x = x + x_new
y = y + y_new
if len(x_new)>0:
if print_counts and len(x_new)>0:
var = [[pd.DataFrame(a[s])for s in range(sequence_length)] for a in x_new ]
df_var = pd.concat([pd.concat(seq_list).T for seq_list in var])
df_var['user'] = user
combined = pd.concat([combined, df_var], ignore_index=True)
combined_ohne = combined.drop('user', axis=1)
print('Alle', len(combined))
print('Unique mit user', len(combined.drop_duplicates()))
print('Unique ohne user', len(combined_ohne.drop_duplicates()))
print('Unique')
print(combined.drop_duplicates()['user'].value_counts())
print('Alle')
print(combined['user'].value_counts())
if print_counts:
combined_ohne = combined.drop('user', axis=1)
print('Alle', len(combined))
print('Unique mit user', len(combined.drop_duplicates()))
print('Unique ohne user', len(combined_ohne.drop_duplicates()))
print('Unique')
print(combined.drop_duplicates()['user'].value_counts())
print('Alle')
print(combined['user'].value_counts())
random.Random(17).shuffle(x)
random.Random(17).shuffle(y)
x = np.array(x)
@@ -239,11 +240,8 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons,n_neurons
# model.add(LSTM(n_neurons, kernel_regularizer=reg1, return_sequences=True))
model.add(LSTM(n_neurons))
# model.add(LSTM(n_neurons2))
# model.add(LSTM(n_neurons3, return_sequences=True))
# model.add(LSTM(n_neurons4))
if model_type == model_type_gru:
model.add(GRU(n_neurons))
# TODO: add another dense layer
#model.add(Dense(n_neurons, activation='relu'))
#model.add(Dropout(d1))
model.add(Dense(len(users), activation='softmax'))
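
The central change in make_sequences is the window step: iterating in steps of sequence_length instead of 1 yields sequences that no longer share days. A standalone toy illustration of the two behaviours follows; windows() is a hypothetical helper, not a function from pipeline.py.

```python
# Toy sketch of the windowing change: overlap=True mirrors the old loop
# (step 1), overlap=False mirrors the new loop (step = sequence_length).
def windows(n_rows, sequence_length, overlap):
    step = 1 if overlap else sequence_length
    return [(i, i + sequence_length)
            for i in range(0, n_rows - sequence_length + 1, step)]

print(windows(10, 3, overlap=True))   # [(0, 3), (1, 4), ..., (7, 10)] -> days shared
print(windows(10, 3, overlap=False))  # [(0, 3), (3, 6), (6, 9)] -> each day used once
```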

preprocessing_new.py (5 changed lines)

@@ -122,7 +122,7 @@ def process_file_15_min(file_path, user_label):
if __name__ == "__main__":
pd.options.mode.copy_on_write = True
# Generate file paths, skipping specified files
# Generate file paths
files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
+ ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
@@ -136,8 +136,7 @@ if __name__ == "__main__":
combined_df = pd.concat(processed_dfs, ignore_index=True)
# Save the combined DataFrame to a new Excel file
# Save the combined DataFrame to a new json file
combined_df.to_json(save_name, index=False)
user_counts = combined_df[user_str].value_counts()
print('Done')
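
The corrected comment reflects that the combined DataFrame is written as JSON, which main.py's get_prepared_data_v3 reads back with pd.read_json. A minimal round-trip sketch with a placeholder file name and toy frame:

```python
# Placeholder path and toy data; illustrates the write/read pair implied by
# to_json here and pd.read_json in get_prepared_data_v3 (orient chosen for the sketch).
import pandas as pd

df = pd.DataFrame({'user': ['a', 'a', 'b'], 'hour': [8, 9, 8]})
save_name = 'combined_toy.json'                  # placeholder path
df.to_json(save_name, orient='records')          # one JSON object per row, no index
restored = pd.read_json(save_name, orient='records')
print(restored.equals(df))                       # True
```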