
Switched preprocessing to use actual step counts instead of boolean values

master
Bianca Steffes 2 months ago
commit 05808cf8f3
Changed files:
  1. .gitignore (1 line changed)
  2. Datasets/hours.json (1 line changed)
  3. Datasets/minutes.json (1 line changed)
  4. Europe/Europe/StepCount46_52.csv (50889 lines changed)
  5. main.py (94 lines changed)
  6. pipeline.py (4 lines changed)
  7. preprocessing_new.py (143 lines changed)
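The gist of the change: the per-interval feature columns now hold the summed step counts rather than activity flags. A rough sketch of the two row formats for the hourly dataset; the Hour_0 to Hour_23 names come from process_file_one_hour below, the old boolean layout is only assumed from the commit message, and all numbers are made up:

import pandas as pd

# Hypothetical row for one user-day; only the first three hour columns shown.
# Old preprocessing (assumed): boolean activity flags per hour.
old_row = pd.DataFrame([{'Hour_0': 0, 'Hour_1': 1, 'Hour_2': 1, 'user': 3}])

# New preprocessing: step counts summed per hour (see process_file_one_hour).
new_row = pd.DataFrame([{'Hour_0': 0, 'Hour_1': 412, 'Hour_2': 1288, 'user': 3}])

# The new values are continuous, which is why main.py now fits a MinMaxScaler
# on the training split before building sequences for the models.
print(old_row)
print(new_row)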

.gitignore (1 line changed)

@@ -143,3 +143,4 @@ working/tuner
 working
 figures
 baseline_results.json
+baseline_results_v3.json

Datasets/hours.json (1 line changed)
File diff suppressed because it is too large

Datasets/minutes.json (1 line changed)
File diff suppressed because it is too large

Europe/Europe/StepCount46_52.csv (50889 lines changed)
File diff suppressed because it is too large

main.py (94 lines changed)

@@ -8,6 +8,7 @@ from keras.src.regularizers import L1L2
 from matplotlib import pyplot as plt
 from pandas import DataFrame
 from sklearn.dummy import DummyClassifier
+from sklearn.preprocessing import MinMaxScaler
 
 from pipeline import (
     load_dataset,
@@ -22,6 +23,9 @@ from pipeline import (
 year_str = 'Year'
 month_str = 'Month'
+date_str = 'Date'
+time_str = 'Time'
+day_of_week_str = 'DayOfWeek'
 user_str = 'user'
 split_str = 'split type'
 data_split_str = 'data percentages'
@@ -38,12 +42,14 @@ precision_str = 'precision'
 recall_str = 'recall'
 f1_string = 'f1 score'
 model_type_str = 'model type'
-weak_column_names = ['DayOfWeek_'+day for day in
+week_column_names = ['DayOfWeek_' + day for day in
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
 figure_path = 'figures/'
 # === Configurable Parameters ===
 dataset_path = './Datasets/'
+dataset_hrs_path = './Datasets/hours.json'
+dataset_min_path = './Datasets/minutes.json'
 DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
 OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
 result_filename_v1 = './working/evaluation_results.json'
@@ -73,7 +79,7 @@ def split_data_by_month_percentage(df, percentages):
     tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
     return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')
 
-def split_data_by_userdata_percentage(df, percentages, sample):
+def split_data_by_userdata_percentage(df, percentages, sample=100):
     train_p, valid_p, test_p = percentages
     tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
     for user_id in df[user_str].unique():
@@ -119,11 +125,14 @@ def main():
 def reduce_columns(df, filename):
     if min_timespan_str in filename:
-        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
+        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + week_column_names, errors='ignore')
     else:
         return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')
 
+def reduce_columns_v3(df):
+    return df.drop(columns=[month_str, year_str, date_str])
+
 
 def load_previous_results(filename):
     results = pd.DataFrame()
     if os.path.exists(filename):
@@ -372,6 +381,37 @@ def manual_tuning(model_type):
     print('Done')
 
+
+def manual_tuning_v3(model_type):
+    # TODO: hrs/min + different sequence lengths
+    sequence_length = 20
+    tr, val, te = get_prepared_data_v3(dataset_hrs_path)
+
+    # fit and evaluate model
+    # config
+    repeats = 3
+    n_batch = 1024
+    n_epochs = 500
+    n_neurons = 16
+    l_rate = 1e-4
+
+    history_list = list()
+    # run diagnostic tests
+    for i in range(repeats):
+        history = train_one_model(tr, val, n_batch, n_epochs,
+                                  n_neurons, l_rate,
+                                  sequence_length=sequence_length,
+                                  model_type=model_type)
+        history_list.append(history)
+
+    for metric in ['p', 'r', 'f1']:
+        for history in history_list:
+            plt.plot(history['train_'+metric], color='blue')
+            plt.plot(history['test_'+metric], color='orange')
+        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
+                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
+        plt.clf()
+    print('Done')
+
 
 def calculate_baselines():
     file_combinations = [(hour_timespan_str, with_threshold_str,'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
@@ -404,6 +444,51 @@ def calculate_baselines():
     baseline_res.to_json('baseline_results.json')
     print('Done')
 
+
+def get_prepared_data_v3(filename, sample=100):
+    df = pd.read_json(filename)
+    df = remove_covid_data(df)
+    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
+    tr = reduce_columns_v3(tr)
+    val = reduce_columns_v3(val)
+    te = reduce_columns_v3(te)
+    # fit the scaler on the training split only, then apply it to all three splits
+    scaler = MinMaxScaler()
+    scaler.fit(tr.drop(columns=[user_str]))
+    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)
+
+
+def scale_dataset(scaler, df):
+    y = df[user_str]
+    x_scaled = scaler.transform(df.drop(columns=[user_str]))
+    df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
+    df_scaled.columns = df.columns
+    return prepare_user_data(df_scaled)
+
+
+def calculate_baselines_v3():
+    file_combinations = [(hour_timespan_str, dataset_hrs_path),
+                         (min_timespan_str, dataset_min_path),
+                         ]
+    baseline_res = pd.DataFrame()
+    for timespan_id, filename in file_combinations:
+        _, _, te = get_prepared_data_v3(filename)
+        for sequence_length in range(5, 30, 5):
+            x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
+            for strategy in ['most_frequent', 'stratified', 'uniform']:
+                cls = DummyClassifier(strategy=strategy)
+                cls.fit(x, y)
+                y_pred = cls.predict(x)
+                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
+                baseline_res = pd.concat([baseline_res,
+                                          DataFrame({'strategy': [strategy],
+                                                     timespan_str: [timespan_id], sequence_length_str: [sequence_length],
+                                                     accuracy_str: [acc], precision_str: [p], recall_str: [r],
+                                                     f1_string: [f1]})], ignore_index=True)
+    baseline_res.to_json('baseline_results_v3.json')
+    print('Done')
+
 
 if __name__ == "__main__":
     # main_two_v1()
@@ -411,6 +496,7 @@ if __name__ == "__main__":
     #test(model_type=model_type_gru)
     # main_two_v2(model_type=model_type_gru)
     #visualise_results_v2()
-    manual_tuning(model_type=model_type_lstm)
+    #manual_tuning(model_type=model_type_lstm)
     #calculate_baselines()
+    calculate_baselines_v3()
     print('Done')
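A note on the new get_prepared_data_v3: the MinMaxScaler is fit on the training split only and then applied to train, validation and test alike, so the value range of the held-out data never influences the scaling. A minimal, self-contained sketch of that pattern, with made-up toy data and column names:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Toy frames standing in for the train/validation splits (illustrative only).
train = pd.DataFrame({'Hour_0': [0, 100, 250], 'Hour_1': [10, 50, 300], 'user': [0, 1, 2]})
val = pd.DataFrame({'Hour_0': [500, 20], 'Hour_1': [5, 80], 'user': [0, 1]})

scaler = MinMaxScaler()
scaler.fit(train.drop(columns=['user']))  # statistics come from the training split only

def scale(df):
    # Transform the feature columns, then re-attach the untouched 'user' label column.
    x = scaler.transform(df.drop(columns=['user']))
    out = pd.DataFrame(x, columns=[c for c in df.columns if c != 'user'])
    out['user'] = df['user'].to_numpy()
    return out

train_s, val_s = scale(train), scale(val)
print(val_s)  # values can fall outside [0, 1] because the scaler never saw the validation data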

pipeline.py (4 lines changed)

@@ -209,7 +209,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     return tuner.get_best_models(num_models=1)[0]
 
-def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
+def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
     x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
     n_features = x.shape[2]
     users = list(train_data.keys())
@@ -217,7 +217,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
     # prepare model
     def build_model():
         model = Sequential()
-        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch, bias_regularizer=reg))
+        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
         # if model_type == model_type_bilstm:
         #     model.add(Bidirectional(units=units_hp))
         if model_type == model_type_lstm:
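The dropped bias_regularizer=reg argument sat on the Input layer, which has no trainable weights or bias to regularize, so removing the reg parameter is the cleaner fix. If regularization is wanted again later, one plausible place for it is the recurrent layer itself; a sketch under that assumption (the L1L2 factors and layer sizes below are example values, not taken from the repository):

from keras.models import Sequential
from keras.layers import Input, LSTM, Dense
from keras.regularizers import L1L2

# Sketch: regularize the LSTM's kernel and bias instead of the (weightless) Input layer.
model = Sequential()
model.add(Input(shape=(20, 31)))  # sequence_length=20, n_features=31 (example values)
model.add(LSTM(16, kernel_regularizer=L1L2(l1=1e-5, l2=1e-4),
               bias_regularizer=L1L2(l2=1e-4)))
model.add(Dense(32, activation='softmax'))  # one output per user class (example count)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')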

preprocessing_new.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import os

import pandas as pd

from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
    week_column_names


def process_file_one_hour(file_path, user_label):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Extract date and hour
    hour_str = 'hour'
    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
    iphone_df[date_str] = iphone_df['startDate'].dt.date
    iphone_df[year_str] = iphone_df['startDate'].dt.year
    iphone_df[month_str] = iphone_df['startDate'].dt.month

    # Group by date and hour, then sum the values
    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()

    # Pivot the data to get one row per day with 24 columns for each hour
    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
                                   columns=hour_str, values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna left the values as float; cast back to int

    # Rename columns to reflect hours
    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
    all_hours = ['Hour_'+ str(i) for i in range(24)]
    for hours in all_hours:
        if hours not in pivot_table.columns:
            pivot_table[hours] = 0

    # Reset index
    pivot_table.reset_index(inplace=True)

    # Add day of the week, month, and year columns
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add 'user' column with the specified user label
    pivot_table[user_str] = user_label

    # Drop the 'DayOfWeek' column
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table


def process_file_15_min(file_path, user_label):
    interval_str = '15min_interval'

    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # TODO: possibly use data from devices other than the iPhone as well
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Round down the startDate to the nearest 15-minute interval
    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')

    # Extract date, time, year, and month for 15-minute intervals
    iphone_df[date_str] = iphone_df[interval_str].dt.date
    iphone_df[time_str] = iphone_df[interval_str].dt.time
    iphone_df[year_str] = iphone_df[interval_str].dt.year
    iphone_df[month_str] = iphone_df[interval_str].dt.month

    # Group by date, time, year, and month, then sum the values
    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()

    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time

    # Pivot the data to get one row per day with columns for each 15-minute interval
    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
                                     values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna left the values as float; cast back to int

    # Reindex to include all possible 15-minute intervals
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

    # Rename columns to reflect 15-minute intervals
    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]

    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
    pivot_table.reset_index(inplace=True)

    # Add day of the week
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add a user column with the specified user label
    pivot_table[user_str] = user_label
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table


if __name__ == "__main__":
    pd.options.mode.copy_on_write = True

    # Generate file paths for every export in both regions
    files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
             + ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
    # Generate user labels based on file index
    user_labels = list(range(len(files)))

    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
                                    (dataset_min_path, process_file_15_min)]:
        # Process each file with its corresponding user label and concatenate the results
        processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]
        combined_df = pd.concat(processed_dfs, ignore_index=True)

        # Save the combined DataFrame to a new JSON file
        combined_df.to_json(save_name, index=False)
        user_counts = combined_df[user_str].value_counts()
    print('Done')
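To make the output format of process_file_one_hour concrete, here is the group-and-pivot step on a tiny made-up extract: raw samples that fall into the same hour of the same day are summed into a single Hour_ column, and hours with no samples are filled with zeros:

import pandas as pd

# Tiny made-up extract of a step-count export (the real files are semicolon-separated CSVs).
raw = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:05:00', '2021-03-01 08:40:00', '2021-03-01 17:12:00']),
    'value': [120, 340, 900],
})

raw['hour'] = raw['startDate'].dt.hour
raw['Date'] = raw['startDate'].dt.date

# Sum all samples that fall into the same hour of the same day ...
hourly = raw.groupby(['Date', 'hour'])['value'].sum().reset_index()

# ... then pivot so each day becomes one row with a column per hour.
wide = hourly.pivot(index='Date', columns='hour', values='value').fillna(0).astype(int)
wide.columns = [f'Hour_{h}' for h in wide.columns]
wide = wide.reindex(columns=[f'Hour_{h}' for h in range(24)], fill_value=0)
print(wide.loc[:, ['Hour_8', 'Hour_17']])  # Hour_8 = 460, Hour_17 = 900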