
Switched preprocessing to use actual step counts instead of boolean values

Branch: master
Bianca Steffes committed 1 week ago
commit 05808cf8f3
Changed files:
1. .gitignore (1 changed line)
2. Datasets/hours.json (1 changed line)
3. Datasets/minutes.json (1 changed line)
4. Europe/Europe/StepCount46_52.csv (50889 changed lines)
5. main.py (96 changed lines)
6. pipeline.py (4 changed lines)
7. preprocessing_new.py (143 changed lines)
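
To make the change concrete, here is a minimal, hypothetical sketch (not code from this repository) contrasting a per-hour boolean activity flag with the summed step counts the new preprocessing produces. The startDate/value column names follow the CSV layout assumed in preprocessing_new.py; the boolean variant is only inferred from the commit message.

import pandas as pd

# Toy step-count export: one row per recorded sample.
samples = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:05', '2021-03-01 08:40', '2021-03-01 17:20']),
    'value': [120, 310, 95],
})
samples['Date'] = samples['startDate'].dt.date
samples['hour'] = samples['startDate'].dt.hour

# Old-style feature (assumed from the commit message): was there any activity in this hour?
as_bool = samples.groupby(['Date', 'hour'])['value'].any().astype(int)

# New-style feature: the actual number of steps in this hour, as in process_file_one_hour.
as_counts = samples.groupby(['Date', 'hour'])['value'].sum()

print(as_bool)    # hour 8 -> 1, hour 17 -> 1
print(as_counts)  # hour 8 -> 430, hour 17 -> 95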

.gitignore (1 changed line)

@@ -143,3 +143,4 @@ working/tuner
working
figures
baseline_results.json
baseline_results_v3.json

Datasets/hours.json (1 changed line)
File diff suppressed because it is too large

Datasets/minutes.json (1 changed line)
File diff suppressed because it is too large

Europe/Europe/StepCount46_52.csv (50889 changed lines)
File diff suppressed because it is too large

main.py (96 changed lines)

@@ -8,6 +8,7 @@ from keras.src.regularizers import L1L2
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler
from pipeline import (
    load_dataset,
@@ -22,6 +23,9 @@ from pipeline import (
year_str = 'Year'
month_str = 'Month'
date_str = 'Date'
time_str = 'Time'
day_of_week_str = 'DayOfWeek'
user_str = 'user'
split_str = 'split type'
data_split_str = 'data percentages'
@@ -38,12 +42,14 @@ precision_str = 'precision'
recall_str = 'recall'
f1_string = 'f1 score'
model_type_str = 'model type'
weak_column_names = ['DayOfWeek_'+day for day in
week_column_names = ['DayOfWeek_' + day for day in
                     ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ]]
figure_path = 'figures/'
# === Configurable Parameters ===
dataset_path = './Datasets/'
dataset_hrs_path = './Datasets/hours.json'
dataset_min_path = './Datasets/minutes.json'
DATA_PATH = dataset_path +'ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
OUTPUT_EXCEL_PATH = './working/evaluation_results.xlsx'
result_filename_v1 = './working/evaluation_results.json'
@@ -73,7 +79,7 @@ def split_data_by_month_percentage(df, percentages):
    tr, va, te = np.split(ids, [int((train_p/100) * len(ids)), int(((train_p + valid_p)/100) * len(ids))])
    return df.merge(tr, on=[year_str, month_str], how='inner'), df.merge(va, on=[year_str, month_str], how='inner'), df.merge(te, on=[year_str, month_str], how='inner')

def split_data_by_userdata_percentage(df, percentages, sample):
def split_data_by_userdata_percentage(df, percentages, sample=100):
    train_p, valid_p, test_p = percentages
    tr, va, te = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    for user_id in df[user_str].unique():
@@ -119,10 +125,13 @@ def main():
def reduce_columns(df, filename):
    if min_timespan_str in filename:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek']+weak_column_names, errors='ignore')
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'] + week_column_names, errors='ignore')
    else:
        return df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'], errors='ignore')

def reduce_columns_v3(df):
    return df.drop(columns=[month_str, year_str, date_str])

def load_previous_results(filename):
    results = pd.DataFrame()
@@ -372,6 +381,37 @@ def manual_tuning(model_type):
    print('Done')

def manual_tuning_v3(model_type):
    # TODO: hrs/min + different sequence lengths
    sequence_length = 20
    tr, val, te = get_prepared_data_v3(dataset_hrs_path)
    # fit and evaluate model
    # config
    repeats = 3
    n_batch = 1024
    n_epochs = 500
    n_neurons = 16
    l_rate = 1e-4
    history_list = list()
    # run diagnostic tests
    for i in range(repeats):
        history = train_one_model(tr, val, n_batch, n_epochs,
                                  n_neurons, l_rate,
                                  sequence_length=sequence_length,
                                  model_type=model_type)
        history_list.append(history)
    for metric in ['p', 'r', 'f1']:
        for history in history_list:
            plt.plot(history['train_'+metric], color='blue')
            plt.plot(history['test_'+metric], color='orange')
        plt.savefig(figure_path+'v3/'+metric+'_e'+str(n_epochs)+'_n'+str(n_neurons)+'_b'+
                    str(n_batch)+'_l'+str(l_rate)+'_diagnostic.png')
        plt.clf()
    print('Done')
def calculate_baselines():
    file_combinations = [(hour_timespan_str, with_threshold_str,'ALL32USERS1HR_WITHTHRESHOLD.xlsx'),
@@ -404,6 +444,51 @@ def calculate_baselines():
    baseline_res.to_json('baseline_results.json')
    print('Done')
def get_prepared_data_v3(filename, sample=100):
    df = pd.read_json(filename)
    df = remove_covid_data(df)
    tr, val, te = split_data_by_userdata_percentage(df, percentages=(80, 10, 10), sample=sample)
    tr = reduce_columns_v3(tr)
    val = reduce_columns_v3(val)
    te = reduce_columns_v3(te)
    # fit the scaler on the training features only, excluding the user label
    scaler = MinMaxScaler()
    scaler.fit(tr.drop(columns=[user_str]))
    return scale_dataset(scaler, tr), scale_dataset(scaler, val), scale_dataset(scaler, te)

def scale_dataset(scaler, df):
    y = df[user_str]
    x_scaled = scaler.transform(df.drop(columns=[user_str]))
    # keep the original row index so features and labels stay aligned in the concat
    df_scaled = pd.concat([pd.DataFrame(x_scaled, index=df.index), pd.DataFrame(y)], axis=1)
    df_scaled.columns = df.columns
    # build the per-user sequences from the scaled frame, not the unscaled input
    return prepare_user_data(df_scaled)
def calculate_baselines_v3():
    file_combinations = [(hour_timespan_str, dataset_hrs_path),
                         (min_timespan_str, dataset_min_path),
                         ]
    baseline_res = pd.DataFrame()
    for timespan_id, filename in file_combinations:
        _, _, te = get_prepared_data_v3(filename)
        for sequence_length in range(5,30, 5):
            x, y = prepare_data_for_model(user_data=te, sequence_length=sequence_length)
            for strategy in ['most_frequent', 'stratified', 'uniform']:
                cls = DummyClassifier(strategy=strategy)
                cls.fit(x,y)
                y_pred = cls.predict(x)
                acc, p, r, f1 = eval_metrics(y_true=y, y_pred=y_pred)
                baseline_res = pd.concat([baseline_res,
                                          DataFrame({ 'strategy':[strategy],
                                                      timespan_str:[timespan_id], sequence_length_str:[sequence_length],
                                                      accuracy_str:[acc],precision_str:[p],recall_str:[r],
                                                      f1_string:f1})], ignore_index=True)
    baseline_res.to_json('baseline_results_v3.json')
    print('Done')
if __name__ == "__main__":
    # main_two_v1()
@@ -411,6 +496,7 @@ if __name__ == "__main__":
    #test(model_type=model_type_gru)
    # main_two_v2(model_type=model_type_gru)
    #visualise_results_v2()
    manual_tuning(model_type=model_type_lstm)
    #manual_tuning(model_type=model_type_lstm)
    #calculate_baselines()
    calculate_baselines_v3()
    print('Done')
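
A note on get_prepared_data_v3 above: it fits the MinMaxScaler on the training features only and reuses the fitted scaler for the validation and test splits. A small self-contained sketch of that pattern with made-up toy frames (not the project's data):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Hypothetical stand-ins for the train and validation splits.
train = pd.DataFrame({'Hour_0': [0, 120, 300], 'Hour_1': [10, 0, 50], 'user': [0, 1, 1]})
val = pd.DataFrame({'Hour_0': [600], 'Hour_1': [5], 'user': [0]})

scaler = MinMaxScaler()
scaler.fit(train.drop(columns=['user']))  # statistics come from the training split only

def scale(df):
    x = scaler.transform(df.drop(columns=['user']))
    out = pd.DataFrame(x, columns=['Hour_0', 'Hour_1'], index=df.index)
    out['user'] = df['user']
    return out

print(scale(train))
print(scale(val))  # values can fall outside [0, 1]; the scaler never saw this split

Dropping the user column before fitting, as the diff does via drop(columns=[user_str]), keeps the label out of the scaled feature matrix.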

pipeline.py (4 changed lines)

@@ -209,7 +209,7 @@ def train_models_v2(user_data, user_data_val, sequence_length, model_type):
    return tuner.get_best_models(num_models=1)[0]

def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, reg, sequence_length, model_type):
def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate, sequence_length, model_type):
    x, y = prepare_data_for_model(user_data=train_data, sequence_length=sequence_length)
    n_features = x.shape[2]
    users = list(train_data.keys())
@@ -217,7 +217,7 @@ def train_one_model(train_data, val_data, n_batch, n_epochs, n_neurons, l_rate,
    # prepare model
    def build_model():
        model = Sequential()
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch, bias_regularizer=reg))
        model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))
        # if model_type == model_type_bilstm:
        #     model.add(Bidirectional(units=units_hp))
        if model_type == model_type_lstm:
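
For context on the Input change above: an Input layer has no weights, so bias_regularizer does not belong there and recent Keras versions reject it as an unrecognized keyword, which is presumably why the commit drops it together with the now unused reg parameter. Below is a rough sketch, with hypothetical layer sizes, output layer, and loss (none of which are shown in this diff), of the kind of Sequential skeleton train_one_model builds after the change:

from keras import Sequential, Input
from keras.layers import LSTM, Dense

# Hypothetical sizes; the real values come from manual_tuning_v3 and the prepared data.
sequence_length, n_features, n_users = 20, 24, 32
n_batch, n_neurons = 1024, 16

model = Sequential()
model.add(Input(shape=(sequence_length, n_features), batch_size=n_batch))  # shape only, no regularizer
model.add(LSTM(n_neurons))                      # regularizers, if any, belong on layers like this one
model.add(Dense(n_users, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()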

preprocessing_new.py (new file, 143 lines)

@@ -0,0 +1,143 @@
import os
import pandas as pd
from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
    week_column_names

def process_file_one_hour(file_path, user_label):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
    # Extract date and hour
    hour_str = 'hour'
    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
    iphone_df[date_str] = iphone_df['startDate'].dt.date
    iphone_df[year_str] = iphone_df['startDate'].dt.year
    iphone_df[month_str] = iphone_df['startDate'].dt.month
    # Group by date and hour, then sum the values
    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()
    # Pivot the data to get one row per day with 24 columns for each hour
    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
                                   columns=hour_str, values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna turned the columns into floats; cast back to int
    # Rename columns to reflect hours
    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
    all_hours = ['Hour_'+ str(i) for i in range(24)]
    for hours in all_hours:
        if hours not in pivot_table.columns:
            pivot_table[hours] = 0
    # Reset index
    pivot_table.reset_index(inplace=True)
    # Add the day-of-the-week column
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0
    # Add 'user' column with the specified user label
    pivot_table[user_str] = user_label
    # Drop the 'DayOfWeek' column
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table
def process_file_15_min(file_path, user_label):
    interval_str = '15min_interval'
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)
    # TODO: possibly use not only iPhone data
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]
    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
    # Round down the startDate to the nearest 15-minute interval
    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')
    # Extract date, time, year, and month for 15-minute intervals
    iphone_df[date_str] = iphone_df[interval_str].dt.date
    iphone_df[time_str] = iphone_df[interval_str].dt.time
    iphone_df[year_str] = iphone_df[interval_str].dt.year
    iphone_df[month_str] = iphone_df[interval_str].dt.month
    # Group by date, time, year, and month, then sum the values
    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()
    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time
    # Pivot the data to get one row per day with columns for each 15-minute interval
    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
                                     values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # fillna turned the columns into floats; cast back to int
    # Reindex to include all possible 15-minute intervals
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
    # Rename columns to reflect 15-minute intervals
    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
    pivot_table.reset_index(inplace=True)
    # Add the day of the week
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
    # One-hot encode the 'DayOfWeek' column
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0
    # Add a user column with the specified user label
    pivot_table[user_str] = user_label
    pivot_table.drop(columns=[day_of_week_str], inplace=True)
    return pivot_table
if __name__ == "__main__":
    pd.options.mode.copy_on_write = True
    # Generate file paths for the exports in both region folders
    files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
             + ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
    # Generate user labels based on file index
    user_labels = list(range(len(files)))
    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
                                    (dataset_min_path, process_file_15_min)]:
        # Process each file with its corresponding user label and concatenate the results
        processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]
        combined_df = pd.concat(processed_dfs, ignore_index=True)
        # Save the combined DataFrame to a JSON file
        combined_df.to_json(save_name, index=False)
        user_counts = combined_df[user_str].value_counts()
    print('Done')
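
The 15-minute variant hinges on flooring each timestamp to its quarter-hour slot and then reindexing to the full 96-slot grid so every day ends up with the same columns. A compact, hypothetical sketch of just that step on toy data (not the project's exports):

import pandas as pd

samples = pd.DataFrame({
    'startDate': pd.to_datetime(['2021-03-01 08:07:12', '2021-03-01 08:29:59', '2021-03-01 08:31:00']),
    'value': [50, 70, 20],
})
samples['slot'] = samples['startDate'].dt.floor('15min').dt.time   # 08:00, 08:15, 08:30
samples['Date'] = samples['startDate'].dt.date

# Sum the step counts per day and quarter-hour slot.
per_slot = samples.groupby(['Date', 'slot'])['value'].sum().unstack(fill_value=0)

# Reindex to all 96 quarter-hour slots so missing intervals become explicit zeros.
full_grid = pd.date_range('00:00', '23:45', freq='15min').time
per_slot = per_slot.reindex(columns=full_grid, fill_value=0)
print(per_slot.iloc[:, 32:36])   # the 08:00-08:45 columns: 50, 70, 20, 0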