
Switched preprocessing to use actual step counts instead of boolean values

master
Bianca Steffes 2 months ago
commit 05808cf8f3
Changed files:
  1. .gitignore (1 line changed)
  2. Datasets/hours.json (1 line changed)
  3. Datasets/minutes.json (1 line changed)
  4. Europe/Europe/StepCount46_52.csv (50889 lines changed)
  5. main.py (94 lines changed)
  6. pipeline.py (4 lines changed)
  7. preprocessing_new.py (143 lines changed)
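The gist of the change: the per-interval feature columns now hold the summed step counts rather than activity flags. A rough sketch of the two row formats for the hourly dataset; the Hour_0 to Hour_23 names come from process_file_one_hour below, the old boolean layout is only assumed from the commit message, and all numbers are made up:

import pandas as pd

# Hypothetical row for one user-day; only the first three hour columns shown.
# Old preprocessing (assumed): boolean activity flags per hour.
old_row = pd.DataFrame([{'Hour_0': 0, 'Hour_1': 1, 'Hour_2': 1, 'user': 3}])

# New preprocessing: step counts summed per hour (see process_file_one_hour).
new_row = pd.DataFrame([{'Hour_0': 0, 'Hour_1': 412, 'Hour_2': 1288, 'user': 3}])

# The new values are continuous, which is why main.py now fits a MinMaxScaler
# on the training split before building sequences for the models.
print(old_row)
print(new_row)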

.gitignore (1 line changed)

@@ -143,3 +143,4 @@ working/tuner
 working
 figures
 baseline_results.json
+baseline_results_v3.json

Datasets/hours.json (1 line changed)
File diff suppressed because it is too large

Datasets/minutes.json (1 line changed)
File diff suppressed because it is too large

Europe/Europe/StepCount46_52.csv (50889 lines changed)
File diff suppressed because it is too large

main.py (94 lines changed)

@@ -8,6 +8,7 @@ from keras.src.regularizers import L1L2
 from matplotlib import pyplot as plt
 from pandas import DataFrame
 from sklearn.dummy import DummyClassifier
+from sklearn.preprocessing import MinMaxScaler
 
 from pipeline import (
     load_dataset,
@@ -22,6 +23,9 @@ from pipeline import (
 year_str = 'Year'
 month_str = 'Month'
+date_str = 'Date'
+time_str = 'Time'
+day_of_week_str = 'DayOfWeek'
 user_str = 'user'
 split_str = 'split type'
 data_split_str = 'data percentages'
@@ -38,12 +42,14 @@ precision_str = 'precision'
 recall_str = 'recall'
 f1_string = 'f1 score'
 model_type_str = 'model type'
-weak_column_names = ['DayOfWeek_'+day for day in
+week_column_names = ['DayOfWeek_' + day for day in
                      ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
 figure_path = 'figures/'
 # === Configurable Parameters ===
 dataset_path = './Datasets/'
+dataset_hrs_path = './Datasets/hours.json'
+dataset_min_path = './Datasets/minutes.json'
 DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
 OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
 result_filename_v1 = './working/evaluation_results.json'
@@ -73,7 +79,7 @@ def split_data_by_month_percentage(df, percentages):
     tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
     return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')
 
-def split_data_by_userdata_percentage(df, percentages, sample):
+def split_data_by_userdata_percentage(df, percentages, sample=100):
     train_p, valid_p, test_p = percentages
     tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
     for user_id in df[user_str].unique():
@@ -119,11 +125,14 @@ def main():
 def reduce_columns(df, filename):
     if min_timespan_str in filename:
-        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
+        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + week_column_names, errors='ignore')
     else:
         return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')
 
+def reduce_columns_v3(df):
+    return df.drop(columns=[month_str, year_str, date_str])
+
 
 def load_previous_results(filename):
     results = pd.DataFrame()
     if os.path.exists(filename):
@@ -372,6 +381,37 @@ def manual_tuning(model_type):
     print('Done')
 
+
+def manual_tuning_v3(model_type):
+    # TODO: hrs/min + different sequence lengths
+    sequence_length = 20
+    tr, val, te = get_prepared_data_v3(dataset_hrs_path)
+
+    # fit and evaluate model
+    # config
+    repeats = 3
+    n_batch = 1024
+    n_epochs = 500
+    n_neurons = 16
+    l_rate = 1e-4
+
+    history_list = list()
+    # run diagnostic tests
+    for i in range(repeats):
+        history = train_one_model(tr, val, n_batch, n_epochs,
+                                  n_neurons, l_rate,
+                                  sequence_length=sequence_length,
+                                  model_type=model_type)
+        history_list.append(history)
+
+    for metric in ['p', 'r', 'f1']:
+        for history in history_list:
+            plt.plot(history['train_'+metric], color='blue')
+            plt.plot(history['test_'+metric], color='orange')
+        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
+                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
+        plt.clf()
+    print('Done')
+
 
 def calculate_baselines():
     file_combinations = [(hour_timespan_str, with_threshold_str,'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
@@ -404,6 +444,51 @@ def calculate_baselines():
     baseline_res.to_json('baseline_results.json')
     print('Done')
 
+
+def get_prepared_data_v3(filename, sample=100):
+    df = pd.read_json(filename)
+    df = remove_covid_data(df)
+    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
+    tr = reduce_columns_v3(tr)
+    val = reduce_columns_v3(val)
+    te = reduce_columns_v3(te)
+    # fit the scaler on the training split only, then apply it to all three splits
+    scaler = MinMaxScaler()
+    scaler.fit(tr.drop(columns=[user_str]))
+    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)
+
+
+def scale_dataset(scaler, df):
+    y = df[user_str]
+    x_scaled = scaler.transform(df.drop(columns=[user_str]))
+    df_scaled = pd.concat([pd.DataFrame(x_scaled), pd.DataFrame(y)], axis=1)
+    df_scaled.columns = df.columns
+    return prepare_user_data(df_scaled)
+
+
+def calculate_baselines_v3():
+    file_combinations = [(hour_timespan_str, dataset_hrs_path),
+                         (min_timespan_str, dataset_min_path),
+                         ]
+    baseline_res = pd.DataFrame()
+    for timespan_id, filename in file_combinations:
+        _, _, te = get_prepared_data_v3(filename)
+        for sequence_length in range(5, 30, 5):
+            x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
+            for strategy in ['most_frequent', 'stratified', 'uniform']:
+                cls = DummyClassifier(strategy=strategy)
+                cls.fit(x, y)
+                y_pred = cls.predict(x)
+                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
+                baseline_res = pd.concat([baseline_res,
+                                          DataFrame({'strategy': [strategy],
+                                                     timespan_str: [timespan_id], sequence_length_str: [sequence_length],
+                                                     accuracy_str: [acc], precision_str: [p], recall_str: [r],
+                                                     f1_string: [f1]})], ignore_index=True)
+    baseline_res.to_json('baseline_results_v3.json')
+    print('Done')
+
 
 if __name__ == "__main__":
     # main_two_v1()
@@ -411,6 +496,7 @@ if __name__ == "__main__":
     #test(model_type=model_type_gru)
     # main_two_v2(model_type=model_type_gru)
     #visualise_results_v2()
-    manual_tuning(model_type=model_type_lstm)
+    #manual_tuning(model_type=model_type_lstm)
     #calculate_baselines()
+    calculate_baselines_v3()
     print('Done')
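A note on the new get_prepared_data_v3: the MinMaxScaler is fit on the training split only and then applied to train, validation and test alike, so the value range of the held-out data never influences the scaling. A minimal, self-contained sketch of that pattern, with made-up toy data and column names:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Toy frames standing in for the train/validation splits (illustrative only).
train = pd.DataFrame({'Hour_0': [0, 100, 250], 'Hour_1': [10, 50, 300], 'user': [0, 1, 2]})
val = pd.DataFrame({'Hour_0': [500, 20], 'Hour_1': [5, 80], 'user': [0, 1]})

scaler = MinMaxScaler()
scaler.fit(train.drop(columns=['user']))  # statistics come from the training split only

def scale(df):
    # Transform the feature columns, then re-attach the untouched 'user' label column.
    x = scaler.transform(df.drop(columns=['user']))
    out = pd.DataFrame(x, columns=[c for c in df.columns if c != 'user'])
    out['user'] = df['user'].to_numpy()
    return out

train_s, val_s = scale(train), scale(val)
print(val_s)  # values can fall outside [0, 1] because the scaler never saw the validation data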

pipeline.py (4 lines changed)

@@ -209,7 +209,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
     return tuner.get_best_models(num_models=1)[0]
 
-def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
+def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
     x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
     n_features = x.shape[2]
     users = list(train_data.keys())
@@ -217,7 +217,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
     # prepare model
     def build_model():
         model = Sequential()
-        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch, bias_regularizer=reg))
+        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
         # if model_type == model_type_bilstm:
         #     model.add(Bidirectional(units=units_hp))
         if model_type == model_type_lstm:
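The dropped bias_regularizer=reg argument sat on the Input layer, which has no trainable weights or bias to regularize, so removing the reg parameter is the cleaner fix. If regularization is wanted again later, one plausible place for it is the recurrent layer itself; a sketch under that assumption (the L1L2 factors and layer sizes below are example values, not taken from the repository):

from keras.models import Sequential
from keras.layers import Input, LSTM, Dense
from keras.regularizers import L1L2

# Sketch: regularize the LSTM's kernel and bias instead of the (weightless) Input layer.
model = Sequential()
model.add(Input(shape=(20, 31)))  # sequence_length=20, n_features=31 (example values)
model.add(LSTM(16, kernel_regularizer=L1L2(l1=1e-5, l2=1e-4),
               bias_regularizer=L1L2(l2=1e-4)))
model.add(Dense(32, activation='softmax'))  # one output per user class (example count)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')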

preprocessing_new.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import os

import pandas as pd

from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
    week_column_names


def process_file_one_hour(file_path, user_label):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Extract date and hour
    hour_str = 'hour'
    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
    iphone_df[date_str] = iphone_df['startDate'].dt.date
    iphone_df[year_str] = iphone_df['startDate'].dt.year
    iphone_df[month_str] = iphone_df['startDate'].dt.month

    # Group by date and hour, then sum the values
    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()

    # Pivot the data to get one row per day with 24 columns for each hour
    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
                                   columns=hour_str, values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna left the values as float; cast back to int

    # Rename columns to reflect hours
    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
    all_hours = ['Hour_'+ str(i) for i in range(24)]
    for hours in all_hours:
        if hours not in pivot_table.columns:
            pivot_table[hours] = 0

    # Reset index
    pivot_table.reset_index(inplace=True)

    # Add day of the week, month, and year columns
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add 'user' column with the specified user label
    pivot_table[user_str] = user_label

    # Drop the 'DayOfWeek' column
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table


def process_file_15_min(file_path, user_label):
    interval_str = '15min_interval'

    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # TODO: possibly use data from devices other than the iPhone as well
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Round down the startDate to the nearest 15-minute interval
    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')

    # Extract date, time, year, and month for 15-minute intervals
    iphone_df[date_str] = iphone_df[interval_str].dt.date
    iphone_df[time_str] = iphone_df[interval_str].dt.time
    iphone_df[year_str] = iphone_df[interval_str].dt.year
    iphone_df[month_str] = iphone_df[interval_str].dt.month

    # Group by date, time, year, and month, then sum the values
    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()

    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time

    # Pivot the data to get one row per day with columns for each 15-minute interval
    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
                                     values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna left the values as float; cast back to int

    # Reindex to include all possible 15-minute intervals
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

    # Rename columns to reflect 15-minute intervals
    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]

    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
    pivot_table.reset_index(inplace=True)

    # Add day of the week
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add a user column with the specified user label
    pivot_table[user_str] = user_label
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table


if __name__ == "__main__":
    pd.options.mode.copy_on_write = True

    # Generate file paths for every export in both regions
    files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
             + ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
    # Generate user labels based on file index
    user_labels = list(range(len(files)))

    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
                                    (dataset_min_path, process_file_15_min)]:
        # Process each file with its corresponding user label and concatenate the results
        processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]
        combined_df = pd.concat(processed_dfs, ignore_index=True)

        # Save the combined DataFrame to a new JSON file
        combined_df.to_json(save_name, index=False)
        user_counts = combined_df[user_str].value_counts()
    print('Done')
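To make the output format of process_file_one_hour concrete, here is the group-and-pivot step on a tiny made-up extract: raw samples that fall into the same hour of the same day are summed into a single Hour_ column, and hours with no samples are filled with zeros:

import pandas as pd

# Tiny made-up extract of a step-count export (the real files are semicolon-separated CSVs).
raw = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:05:00', '2021-03-01 08:40:00', '2021-03-01 17:12:00']),
    'value': [120, 340, 900],
})

raw['hour'] = raw['startDate'].dt.hour
raw['Date'] = raw['startDate'].dt.date

# Sum all samples that fall into the same hour of the same day ...
hourly = raw.groupby(['Date', 'hour'])['value'].sum().reset_index()

# ... then pivot so each day becomes one row with a column per hour.
wide = hourly.pivot(index='Date', columns='hour', values='value').fillna(0).astype(int)
wide.columns = [f'Hour_{h}' for h in wide.columns]
wide = wide.reindex(columns=[f'Hour_{h}' for h in range(24)], fill_value=0)
print(wide.loc[:, ['Hour_8', 'Hour_17']])  # Hour_8 = 460, Hour_17 = 900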