
Switched preprocessing to use actual step counts instead of boolean values

Branch: master
Bianca Steffes committed 1 week ago
commit 05808cf8f3
Changed files:
1. .gitignore (1 changed line)
2. Datasets/hours.json (1 changed line)
3. Datasets/minutes.json (1 changed line)
4. Europe/Europe/StepCount46_52.csv (50889 changed lines)
5. main.py (96 changed lines)
6. pipeline.py (4 changed lines)
7. preprocessing_new.py (143 changed lines)
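
To make the change concrete, here is a minimal, hypothetical sketch (not code from this repository) contrasting a per-hour boolean activity flag with the summed step counts the new preprocessing produces. The startDate/value column names follow the CSV layout assumed in preprocessing_new.py; the boolean variant is only inferred from the commit message.

import pandas as pd

# Toy step-count export: one row per recorded sample.
samples = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:05', '2021-03-01 08:40', '2021-03-01 17:20']),
    'value': [120, 310, 95],
})
samples['Date'] = samples['startDate'].dt.date
samples['hour'] = samples['startDate'].dt.hour

# Old-style feature (assumed from the commit message): was there any activity in this hour?
as_bool = samples.groupby(['Date', 'hour'])['value'].any().astype(int)

# New-style feature: the actual number of steps in this hour, as in process_file_one_hour.
as_counts = samples.groupby(['Date', 'hour'])['value'].sum()

print(as_bool)    # hour 8 -> 1, hour 17 -> 1
print(as_counts)  # hour 8 -> 430, hour 17 -> 95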

.gitignore (1 changed line)

@@ -143,3 +143,4 @@ working/tuner
working
figures
baseline_results.json
baseline_results_v3.json

Datasets/hours.json (1 changed line)
File diff suppressed because it is too large

Datasets/minutes.json (1 changed line)
File diff suppressed because it is too large

Europe/Europe/StepCount46_52.csv (50889 changed lines)
File diff suppressed because it is too large

main.py (96 changed lines)

@@ -8,6 +8,7 @@ from keras.src.regularizers import L1L2
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler
from pipeline import (
    load_dataset,
@@ -22,6 +23,9 @@ from pipeline import (
year_str = 'Year'
month_str = 'Month'
date_str = 'Date'
time_str = 'Time'
day_of_week_str = 'DayOfWeek'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
@@ -38,12 +42,14 @@ precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'
weak_column_names = ['DayOfWeek_'+day for day in
week_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
figure_path = 'figures/'
# === Configurable Parameters ===
dataset_path = './Datasets/'
dataset_hrs_path = './Datasets/hours.json'
dataset_min_path = './Datasets/minutes.json'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
@@ -73,7 +79,7 @@ def split_data_by_month_percentage(df, percentages):
    tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
    return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')

def split_data_by_userdata_percentage(df, percentages, sample):
def split_data_by_userdata_percentage(df, percentages, sample=100):
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
@@ -119,10 +125,13 @@ def main():
def reduce_columns(df, filename):
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + week_column_names, errors='ignore')
    else:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')

def reduce_columns_v3(df):
    return df.drop(columns=[month_str, year_str, date_str])

def load_previous_results(filename):
    results = pd.DataFrame()
@@ -372,6 +381,37 @@ def manual_tuning(model_type):
    print('Done')

def manual_tuning_v3(model_type):
    # TODO: hrs/min + different sequence lengths
    sequence_length = 20
    tr, val, te = get_prepared_data_v3(dataset_hrs_path)
    # fit and evaluate model
    # config
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 16
    l_rate = 1e-4
    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(tr, val, n_batch, n_epochs,
                                  n_neurons, l_rate,
                                  sequence_length=sequence_length,
                                  model_type=model_type)
        history_list.append(history)
    for metric in ['p', 'r', 'f1']:
        for history in history_list:
            plt.plot(history['train_'+metric], color='blue')
            plt.plot(history['test_'+metric], color='orange')
        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
        plt.clf()
    print('Done')
def calculate_baselines():
    file_combinations = [(hour_timespan_str, with_threshold_str,'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
@@ -404,6 +444,51 @@ def calculate_baselines():
    baseline_res.to_json('baseline_results.json')
    print('Done')
def get_prepared_data_v3(filename, sample=100):
    df = pd.read_json(filename)
    df = remove_covid_data(df)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
    tr = reduce_columns_v3(tr)
    val = reduce_columns_v3(val)
    te = reduce_columns_v3(te)
    # fit the scaler on the training features only, excluding the user label
    scaler = MinMaxScaler()
    scaler.fit(tr.drop(columns=[user_str]))
    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)

def scale_dataset(scaler, df):
    y = df[user_str]
    x_scaled = scaler.transform(df.drop(columns=[user_str]))
    # keep the original row index so features and labels stay aligned in the concat
    df_scaled = pd.concat([pd.DataFrame(x_scaled, index=df.index), pd.DataFrame(y)], axis=1)
    df_scaled.columns = df.columns
    # build the per-user sequences from the scaled frame, not the unscaled input
    return prepare_user_data(df_scaled)
def calculate_baselines_v3():
    file_combinations = [(hour_timespan_str, dataset_hrs_path),
                         (min_timespan_str, dataset_min_path),
                         ]
    baseline_res = pd.DataFrame()
    for timespan_id, filename in file_combinations:
        _, _, te = get_prepared_data_v3(filename)
        for sequence_length in range(5,30, 5):
            x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
            for strategy in ['most_frequent', 'stratified', 'uniform']:
                cls = DummyClassifier(strategy=strategy)
                cls.fit(x,y)
                y_pred = cls.predict(x)
                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
                baseline_res = pd.concat([baseline_res,
                                          DataFrame({ 'strategy':[strategy],
                                                      timespan_str:[timespan_id], sequence_length_str:[sequence_length],
                                                      accuracy_str:[acc],precision_str:[p],recall_str:[r],
                                                      f1_string:f1})], ignore_index=True)
    baseline_res.to_json('baseline_results_v3.json')
    print('Done')
if __name__ == "__main__":
    # main_two_v1()
@@ -411,6 +496,7 @@ if __name__ == "__main__":
    #test(model_type=model_type_gru)
    # main_two_v2(model_type=model_type_gru)
    #visualise_results_v2()
    manual_tuning(model_type=model_type_lstm)
    #manual_tuning(model_type=model_type_lstm)
    #calculate_baselines()
    calculate_baselines_v3()
    print('Done')
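
A note on get_prepared_data_v3 above: it fits the MinMaxScaler on the training features only and reuses the fitted scaler for the validation and test splits. A small self-contained sketch of that pattern with made-up toy frames (not the project's data):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Hypothetical stand-ins for the train and validation splits.
train = pd.DataFrame({'Hour_0': [0, 120, 300], 'Hour_1': [10, 0, 50], 'user': [0, 1, 1]})
val = pd.DataFrame({'Hour_0': [600], 'Hour_1': [5], 'user': [0]})

scaler = MinMaxScaler()
scaler.fit(train.drop(columns=['user']))  # statistics come from the training split only

def scale(df):
    x = scaler.transform(df.drop(columns=['user']))
    out = pd.DataFrame(x, columns=['Hour_0', 'Hour_1'], index=df.index)
    out['user'] = df['user']
    return out

print(scale(train))
print(scale(val))  # values can fall outside [0, 1]; the scaler never saw this split

Dropping the user column before fitting, as the diff does via drop(columns=[user_str]), keeps the label out of the scaled feature matrix.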

pipeline.py (4 changed lines)

@@ -209,7 +209,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
    return tuner.get_best_models(num_models=1)[0]

def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
    x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
    n_features = x.shape[2]
    users = list(train_data.keys())
@@ -217,7 +217,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
    # prepare model
    def build_model():
        model = Sequential()
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch, bias_regularizer=reg))
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
        # if model_type == model_type_bilstm:
        #     model.add(Bidirectional(units=units_hp))
        if model_type == model_type_lstm:
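
For context on the Input change above: an Input layer has no weights, so bias_regularizer does not belong there and recent Keras versions reject it as an unrecognized keyword, which is presumably why the commit drops it together with the now unused reg parameter. Below is a rough sketch, with hypothetical layer sizes, output layer, and loss (none of which are shown in this diff), of the kind of Sequential skeleton train_one_model builds after the change:

from keras import Sequential, Input
from keras.layers import LSTM, Dense

# Hypothetical sizes; the real values come from manual_tuning_v3 and the prepared data.
sequence_length, n_features, n_users = 20, 24, 32
n_batch, n_neurons = 1024, 16

model = Sequential()
model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))  # shape only, no regularizer
model.add(LSTM(n_neurons))                      # regularizers, if any, belong on layers like this one
model.add(Dense(n_users, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()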

preprocessing_new.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import os
import pandas as pd
from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
    week_column_names

def process_file_one_hour(file_path, user_label):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
    # Extract date and hour
    hour_str = 'hour'
    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
    iphone_df[date_str] = iphone_df['startDate'].dt.date
    iphone_df[year_str] = iphone_df['startDate'].dt.year
    iphone_df[month_str] = iphone_df['startDate'].dt.month
    # Group by date and hour, then sum the values
    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()
    # Pivot the data to get one row per day with 24 columns for each hour
    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
                                   columns=hour_str, values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna turned the columns into floats; cast back to int
    # Rename columns to reflect hours
    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
    all_hours = ['Hour_'+ str(i) for i in range(24)]
    for hours in all_hours:
        if hours not in pivot_table.columns:
            pivot_table[hours] = 0
    # Reset index
    pivot_table.reset_index(inplace=True)
    # Add the day-of-the-week column
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0
    # Add 'user' column with the specified user label
    pivot_table[user_str] = user_label
    # Drop the 'DayOfWeek' column
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table
def process_file_15_min(file_path, user_label):
    interval_str = '15min_interval'
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)
    # TODO: possibly use not only iPhone data
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]
    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
    # Round down the startDate to the nearest 15-minute interval
    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')
    # Extract date, time, year, and month for 15-minute intervals
    iphone_df[date_str] = iphone_df[interval_str].dt.date
    iphone_df[time_str] = iphone_df[interval_str].dt.time
    iphone_df[year_str] = iphone_df[interval_str].dt.year
    iphone_df[month_str] = iphone_df[interval_str].dt.month
    # Group by date, time, year, and month, then sum the values
    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()
    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time
    # Pivot the data to get one row per day with columns for each 15-minute interval
    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
                                     values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna turned the columns into floats; cast back to int
    # Reindex to include all possible 15-minute intervals
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
    # Rename columns to reflect 15-minute intervals
    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
    pivot_table.reset_index(inplace=True)
    # Add the day of the week
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0
    # Add a user column with the specified user label
    pivot_table[user_str] = user_label
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table
if __name__ == "__main__":
    pd.options.mode.copy_on_write = True
    # Generate file paths for the exports in both region folders
    files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
             + ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
    # Generate user labels based on file index
    user_labels = list(range(len(files)))
    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
                                    (dataset_min_path, process_file_15_min)]:
        # Process each file with its corresponding user label and concatenate the results
        processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]
        combined_df = pd.concat(processed_dfs, ignore_index=True)
        # Save the combined DataFrame to a JSON file
        combined_df.to_json(save_name, index=False)
        user_counts = combined_df[user_str].value_counts()
    print('Done')
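
The 15-minute variant hinges on flooring each timestamp to its quarter-hour slot and then reindexing to the full 96-slot grid so every day ends up with the same columns. A compact, hypothetical sketch of just that step on toy data (not the project's exports):

import pandas as pd

samples = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:07:12', '2021-03-01 08:29:59', '2021-03-01 08:31:00']),
    'value': [50, 70, 20],
})
samples['slot'] = samples['startDate'].dt.floor('15min').dt.time   # 08:00, 08:15, 08:30
samples['Date'] = samples['startDate'].dt.date

# Sum the step counts per day and quarter-hour slot.
per_slot = samples.groupby(['Date', 'slot'])['value'].sum().unstack(fill_value=0)

# Reindex to all 96 quarter-hour slots so missing intervals become explicit zeros.
full_grid = pd.date_range('00:00', '23:45', freq='15min').time
per_slot = per_slot.reindex(columns=full_grid, fill_value=0)
print(per_slot.iloc[:, 32:36])   # the 08:00-08:45 columns: 50, 70, 20, 0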