diff --git a/pipeline.py b/pipeline.py index bcd136a..5fc140a 100644 --- a/pipeline.py +++ b/pipeline.py @@ -77,10 +77,24 @@ def make_sequences(data, sequence_length): def prepare_data_for_model(user_data, sequence_length): x, y = [], [] + combined = pd.DataFrame() for user, data in user_data.items(): x_new, y_new = make_sequences(data, sequence_length) x = x + x_new y = y + y_new + if len(x_new)>0: + var = [[pd.DataFrame(a[s])for s in range(sequence_length)] for a in x_new ] + df_var = pd.concat([pd.concat(seq_list).T for seq_list in var]) + df_var['user'] = user + combined = pd.concat([combined, df_var], ignore_index=True) + combined_ohne = combined.drop('user', axis=1) + print('Alle', len(combined)) + print('Unique mit user', len(combined.drop_duplicates())) + print('Unique ohne user', len(combined_ohne.drop_duplicates())) + print('Unique') + print(combined.drop_duplicates()['user'].value_counts()) + print('Alle') + print(combined['user'].value_counts()) random.Random(17).shuffle(x) random.Random(17).shuffle(y) x = np.array(x)