7 changed files with 239 additions and 50896 deletions

.gitignore: 1 line changed
Datasets/hours.json: 1 line changed
Datasets/minutes.json: 1 line changed
Europe/Europe/StepCount46_52.csv: 50889 lines changed
main.py: 94 lines changed
pipeline.py: 4 lines changed
preprocessing_new.py: 143 lines changed
Datasets/hours.json: file diff suppressed because it is too large
Datasets/minutes.json: file diff suppressed because it is too large
Europe/Europe/StepCount46_52.csv: file diff suppressed because it is too large
preprocessing_new.py
@@ -0,0 +1,143 @@
import os

import pandas as pd

from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
    week_column_names
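# Note on the imports above: main.py is not part of this hunk, so the exact
# values of these constants are assumptions inferred from their use below,
# i.e. column labels plus the two output paths that do appear in this diff:
#   date_str, time_str, year_str, month_str  -> e.g. 'date', 'Time', 'Year', 'Month'
#   day_of_week_str, user_str                -> e.g. 'DayOfWeek', 'user'
#   week_column_names                        -> e.g. ['DayOfWeek_Monday', ..., 'DayOfWeek_Sunday']
#   dataset_hrs_path, dataset_min_path       -> 'Datasets/hours.json', 'Datasets/minutes.json'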
def process_file_one_hour(file_path, user_label):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # treat NaN as False

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Extract date and hour
    hour_str = 'hour'
    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
    iphone_df[date_str] = iphone_df['startDate'].dt.date
    iphone_df[year_str] = iphone_df['startDate'].dt.year
    iphone_df[month_str] = iphone_df['startDate'].dt.month

    # Group by date and hour, then sum the values
    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()

    # Pivot the data to get one row per day with 24 columns, one per hour
    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
                                   columns=hour_str, values='value').fillna(0)

    pivot_table = pivot_table.astype(int)  # values became float because of the filled NaNs

    # Rename columns to reflect hours, then add any hours missing from the data
    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
    all_hours = ['Hour_' + str(i) for i in range(24)]
    for hour_col in all_hours:
        if hour_col not in pivot_table.columns:
            pivot_table[hour_col] = 0

    # Reset index so date, year, and month become regular columns
    pivot_table.reset_index(inplace=True)

    # Add the day-of-week column
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column, adding any weekdays absent from the data
    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add the 'user' column with the specified user label
    pivot_table[user_str] = user_label

    # Drop the raw 'DayOfWeek' column now that it is one-hot encoded
    pivot_table.drop(columns=[day_of_week_str], inplace=True)

    return pivot_table

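# A minimal, self-contained sketch of the hour-pivot step above, runnable on a
# toy frame. The 'date'/'hour' labels are local stand-ins for the constants
# imported from main; the reindex shows a one-step alternative to the
# column-by-column fill-in loop used in process_file_one_hour.
def _hour_pivot_sketch():
    raw = pd.DataFrame({
        'startDate': pd.to_datetime(['2024-01-01 08:10:00', '2024-01-01 08:40:00',
                                     '2024-01-01 21:05:00', '2024-01-02 07:30:00']),
        'value': [120, 80, 300, 50],
    })
    raw['date'] = raw['startDate'].dt.date
    raw['hour'] = raw['startDate'].dt.hour
    hourly = raw.groupby(['date', 'hour'])['value'].sum().reset_index()
    pivot = hourly.pivot(index='date', columns='hour', values='value')
    # reindex guarantees all 24 hour columns in one step, fillna covers the
    # (day, hour) pairs that existed as columns but had no samples
    pivot = pivot.reindex(columns=range(24)).fillna(0).astype(int)
    pivot.columns = [f'Hour_{h}' for h in pivot.columns]
    return pivot  # one row per day, 24 hour columns
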
def process_file_15_min(file_path, user_label):
    interval_str = '15min_interval'

    # Load the dataset
    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

    # TODO: possibly use data from more devices than just the iPhone
    # Filter for iPhone devices
    iphone_df = df[df['device'].str.contains('iPhone', na=False)]

    # Convert startDate to datetime
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

    # Round down the startDate to the nearest 15-minute interval
    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')

    # Extract date, time, year, and month for the 15-minute intervals
    iphone_df[date_str] = iphone_df[interval_str].dt.date
    iphone_df[time_str] = iphone_df[interval_str].dt.time
    iphone_df[year_str] = iphone_df[interval_str].dt.year
    iphone_df[month_str] = iphone_df[interval_str].dt.month

    # Group by date, time, year, and month, then sum the values
    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()

    # Create the full range of 15-minute intervals (00:00:00 to 23:45:00)
    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time

    # Pivot the data to get one row per day with a column for each 15-minute interval
    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
                                     values='value').fillna(0)
    pivot_table = pivot_table.astype(int)  # values became float because of the filled NaNs

    # Reindex to include all possible 15-minute intervals
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

    # Rename columns to string labels for the 15-minute intervals
    pivot_table.columns = [str(col) for col in pivot_table.columns]

    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index levels
    pivot_table.reset_index(inplace=True)

    # Add the day-of-week column
    pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()

    # One-hot encode the 'DayOfWeek' column, adding any weekdays absent from the data
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
    for week_day_col in week_column_names:
        if week_day_col not in pivot_table.columns:
            pivot_table[week_day_col] = 0

    # Add a user column with the specified user label
    pivot_table[user_str] = user_label

    # Drop the raw 'DayOfWeek' column now that it is one-hot encoded
    pivot_table.drop(columns=[day_of_week_str], inplace=True)

    return pivot_table

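# A self-contained sketch of the 15-minute bucketing used above: floor() maps
# each timestamp onto the quarter-hour grid, and reindexing against the full
# 96-slot day fills the intervals that received no samples.
def _quarter_hour_sketch():
    ts = pd.Series(pd.to_datetime(['2024-01-01 08:07:00',
                                   '2024-01-01 08:14:59',
                                   '2024-01-01 08:16:00']))
    slots = ts.dt.floor('15min').dt.time  # 08:07 and 08:14:59 -> 08:00; 08:16 -> 08:15
    counts = slots.value_counts()
    full_grid = pd.date_range('00:00', '23:45', freq='15min').time  # 96 slots
    return counts.reindex(full_grid, fill_value=0)
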
if __name__ == "__main__":
    pd.options.mode.copy_on_write = True
    # Collect the input file paths from both region directories
    files = (['Europe/Europe/' + file for file in os.listdir('Europe/Europe/')]
             + ['Rest_of_the_World/' + file for file in os.listdir('Rest_of_the_World')])

    # Generate user labels based on file index
    user_labels = list(range(len(files)))

    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
                                    (dataset_min_path, process_file_15_min)]:
        # Process each file with its corresponding user label and concatenate the results
        processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]

        combined_df = pd.concat(processed_dfs, ignore_index=True)

        # Save the combined DataFrame to a new JSON file; orient='records' omits
        # the index (to_json rejects index=False for the default orient)
        combined_df.to_json(save_name, orient='records')

        # Sanity check: rows per user in the combined dataset
        user_counts = combined_df[user_str].value_counts()
        print(user_counts)

    print('Done')
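# Round-trip sketch: reading one of the saved datasets back. Assumes the
# orient='records' layout written above.
def _reload_sketch(path=dataset_hrs_path):
    return pd.read_json(path, orient='records')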