You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
335 lines
15 KiB
335 lines
15 KiB
import os
|
|
|
|
import pandas as pd
|
|
|
|
def process_file_one_hour_no_threshold(file_path, user_label):
|
|
|
|
# Load the dataset
|
|
df = pd.read_csv(file_path, delimiter=';')
|
|
|
|
# Step 1: Filter for iPhone devices
|
|
iphone_df = df[df['device'].str.contains('iPhone', na=False)] # Treat NaN as False
|
|
|
|
# Step 2: Select the desired columns
|
|
result = iphone_df[['startDate', 'endDate', 'value']]
|
|
|
|
# Step 3: Convert startDate to datetime
|
|
iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
|
|
|
|
# Step 4: Extract date and hour
|
|
iphone_df['date'] = iphone_df['startDate'].dt.date
|
|
iphone_df['hour'] = iphone_df['startDate'].dt.hour
|
|
|
|
# Step 5: Group by date and hour, then sum the values
|
|
hourly_sum = iphone_df.groupby(['date', 'hour'])['value'].sum().reset_index()
|
|
|
|
# Step 6: Pivot the data to get one row per day with 24 columns for each hour
|
|
pivot_table = hourly_sum.pivot(index='date', columns='hour', values='value').fillna(0)
|
|
|
|
# Step 7: Rename columns to reflect hours
|
|
pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
|
|
|
|
# Step 8: Reset index to have 'date' as a column instead of index
|
|
pivot_table.reset_index(inplace=True)
|
|
|
|
# Step 9: Add day of the week, month, and year columns
|
|
pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
|
|
pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
|
|
pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
|
|
|
|
# Step 10: One-hot encode the 'DayOfWeek' column
|
|
pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
|
|
|
|
# Step 11: Convert hourly values to binary (True if > 0, else False)
|
|
for col in pivot_table.columns[1:25]: # Skip the 'date' column and focus on hours
|
|
pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
|
|
|
|
# Step 12: Add 'user' column with the specified user label
|
|
pivot_table['user'] = user_label
|
|
# Print which file is currently being processed
|
|
print(file_path,user_label)
|
|
# Step 13: Drop the 'DayOfWeek' column
|
|
pivot_table.drop(columns=['DayOfWeek'], inplace=True)
|
|
|
|
return pivot_table
|
|
|
|
# List of files to skip
|
|
files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
|
|
'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv','StepCount27.csv', 'StepCount31.csv','StepCount32.csv',
|
|
'StepCount42.csv', 'StepCount46.csv'}
|
|
|
|
# Generate file paths, skipping specified files
|
|
file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
|
|
if f'StepCount{i:02d}.csv' not in files_to_skip]
|
|
|
|
# Generate user labels based on file index
|
|
user_labels = list(range(len(file_paths)))
|
|
|
|
|
|
# Process each file with its corresponding user label and concatenate the results
|
|
processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
|
|
combined_df = pd.concat(processed_dfs, ignore_index=True)
|
|
|
|
# Save the combined DataFrame to a new Excel file
|
|
updated_file_path = '/content/combined_aggregated_data.xlsx'
|
|
combined_df.to_excel(updated_file_path, index=False)
|
|
|
|
# Print the final DataFrame
|
|
print(combined_df)
|
|
|
|
def process_file_15_min_no_threshold(file_path, user_label):
|
|
# Load the dataset
|
|
df = pd.read_csv(file_path, delimiter=';')
|
|
|
|
# Filter for iPhone devices
|
|
iphone_df = df[df['device'].str.contains('iPhone', na=False)]
|
|
|
|
# Convert startDate to datetime
|
|
iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
|
|
|
|
# Round down the startDate to the nearest 15-minute interval
|
|
iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
|
|
|
|
# Extract date, time, year, and month for 15-minute intervals
|
|
iphone_df['date'] = iphone_df['15min_interval'].dt.date
|
|
iphone_df['time'] = iphone_df['15min_interval'].dt.time
|
|
iphone_df['Year'] = iphone_df['15min_interval'].dt.year
|
|
iphone_df['Month'] = iphone_df['15min_interval'].dt.month
|
|
|
|
# Group by date, time, year, and month, then sum the values
|
|
|
|
|
|
interval_sum = iphone_df.groupby(['date', 'time', 'Year', 'Month'])['value'].sum().reset_index()
|
|
|
|
# Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
|
|
full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
|
|
|
|
# Pivot the data to get one row per day with columns for each 15-minute interval
|
|
pivot_table = interval_sum.pivot(index=['date', 'Year', 'Month'], columns='time', values='value').fillna(0)
|
|
|
|
# Reindex to include all possible 15-minute intervals
|
|
pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
|
|
|
|
# Rename columns to reflect 15-minute intervals
|
|
pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
|
|
|
|
# Convert interval values to boolean (True if > 0, else False)
|
|
pivot_table = pivot_table.apply(lambda col: col != 0, axis=0)
|
|
|
|
# Reset index to have 'date', 'Year', and 'Month' as columns instead of index
|
|
pivot_table.reset_index(inplace=True)
|
|
|
|
# Add day of the week
|
|
pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
|
|
|
|
# One-hot encode the 'DayOfWeek' column
|
|
pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
|
|
|
|
# Add a user column with the specified user label
|
|
pivot_table['user'] = user_label
|
|
|
|
# Print which file is currently being processed
|
|
print(f"Processing file: {file_path}, User label: {user_label}")
|
|
|
|
return pivot_table
|
|
|
|
# List of files to skip
|
|
files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
|
|
'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
|
|
'StepCount42.csv', 'StepCount46.csv'}
|
|
|
|
# Generate file paths, skipping specified files
|
|
file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
|
|
if f'StepCount{i:02d}.csv' not in files_to_skip]
|
|
|
|
# Generate user labels based on file index
|
|
user_labels = list(range(len(file_paths)))
|
|
|
|
# Process each file with its corresponding user label and concatenate the results
|
|
processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
|
|
combined_df = pd.concat(processed_dfs, ignore_index=True)
|
|
|
|
# Save the combined DataFrame to a new Excel file
|
|
updated_file_path = '/content/combined_aggregated_data_15min_without_threshold.xlsx'
|
|
combined_df.to_excel(updated_file_path, index=False)
|
|
|
|
# Print the final DataFrame
|
|
print(combined_df)
|
|
|
|
|
|
user_counts = combined_df['user'].value_counts()
|
|
|
|
# Display the count of each user
|
|
print(user_counts.sort_index())
|
|
|
|
def process_file_15_min_with_threshold(file_path, user_label):
|
|
# Load the dataset
|
|
df = pd.read_csv(file_path, delimiter=';')
|
|
|
|
# Step 1: Filter for iPhone devices
|
|
iphone_df = df[df['device'].str.contains('iPhone', na=False)] # Treat NaN as False
|
|
|
|
# Step 2: Select the desired columns
|
|
result = iphone_df[['startDate', 'endDate', 'value']]
|
|
|
|
# Step 3: Convert startDate to datetime
|
|
iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
|
|
|
|
# Step 4: Round down the startDate to the nearest 15-minute interval
|
|
iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
|
|
|
|
# Step 5: Extract date and time
|
|
iphone_df['date'] = iphone_df['15min_interval'].dt.date
|
|
iphone_df['time'] = iphone_df['15min_interval'].dt.time
|
|
|
|
# Step 6: Group by date and time, then sum the values for 15-minute intervals
|
|
iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
|
|
interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
|
|
|
|
# Step 7: Pivot the data to get one row per day with columns for each 15-minute interval
|
|
pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
|
|
|
|
# Step 8: Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
|
|
full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
|
|
|
|
# Step 9: Reindex to include all possible 15-minute intervals and fill missing values with 0
|
|
pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
|
|
|
|
# Step 10: Rename columns to reflect 15-minute intervals
|
|
pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
|
|
|
|
# Step 11: Reset index to have 'date' as a column instead of an index
|
|
pivot_table.reset_index(inplace=True)
|
|
|
|
# Step 12: Add day of the week, month, and year columns
|
|
pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
|
|
pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
|
|
pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
|
|
|
|
# Step 13: One-hot encode the 'DayOfWeek' column
|
|
pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
|
|
|
|
# Step 14: Convert 15-minute interval values to binary (True if > 0, else False)
|
|
for col in pivot_table.columns[1:97]: # Skip the 'date' column and focus on 15-minute intervals
|
|
pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
|
|
|
|
# Step 15: Add 'user' column with the specified user label
|
|
pivot_table['user'] = user_label
|
|
|
|
# Print which file is currently being processed
|
|
print(f"Processing file: {file_path}, User label: {user_label}")
|
|
|
|
# Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
|
|
pivot_table.drop(columns=['DayOfWeek'], inplace=True)
|
|
|
|
return pivot_table
|
|
|
|
# List of files to skip
|
|
files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
|
|
'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
|
|
'StepCount42.csv', 'StepCount46.csv'}
|
|
|
|
# Generate file paths, skipping specified files
|
|
file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
|
|
if f'StepCount{i:02d}.csv' not in files_to_skip]
|
|
|
|
# Generate user labels based on file index
|
|
user_labels = list(range(len(file_paths)))
|
|
|
|
# Process each file with its corresponding user label and concatenate the results
|
|
processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
|
|
combined_df = pd.concat(processed_dfs, ignore_index=True)
|
|
|
|
# Save the combined DataFrame to a new Excel file
|
|
updated_file_path = '/content/combined_aggregated_data_15min_with_threshold.xlsx'
|
|
combined_df.to_excel(updated_file_path, index=False)
|
|
|
|
# Print the final DataFrame
|
|
print(combined_df)
|
|
|
|
|
|
def process_file_1_hour_with_threshold(file_path, user_label):
|
|
# Load the dataset
|
|
df = pd.read_csv(file_path, delimiter=';')
|
|
|
|
# Step 1: Filter for iPhone devices
|
|
iphone_df = df[df['device'].str.contains('iPhone', na=False)] # Treat NaN as False
|
|
|
|
# Step 2: Select the desired columns
|
|
result = iphone_df[['startDate', 'endDate', 'value']]
|
|
|
|
# Step 3: Convert startDate to datetime
|
|
iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
|
|
|
|
# Step 4: Round down the startDate to the nearest 1-hour interval
|
|
iphone_df['1hr_interval'] = iphone_df['startDate'].dt.floor('H')
|
|
|
|
# Step 5: Extract date and time
|
|
iphone_df['date'] = iphone_df['1hr_interval'].dt.date
|
|
iphone_df['time'] = iphone_df['1hr_interval'].dt.time
|
|
|
|
# Step 6: Group by date and time, then sum the values for 1-hour intervals
|
|
iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
|
|
interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
|
|
|
|
# Step 7: Pivot the data to get one row per day with columns for each 1-hour interval
|
|
pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
|
|
|
|
# Step 8: Create a full range of 1-hour intervals (00:00:00 to 23:00:00)
|
|
full_time_range = pd.date_range('00:00', '23:00', freq='H').time
|
|
|
|
# Step 9: Reindex to include all possible 1-hour intervals and fill missing values with 0
|
|
pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
|
|
|
|
# Step 10: Rename columns to reflect 1-hour intervals
|
|
pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
|
|
|
|
# Step 11: Reset index to have 'date' as a column instead of an index
|
|
pivot_table.reset_index(inplace=True)
|
|
|
|
# Step 12: Add day of the week, month, and year columns
|
|
pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
|
|
pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
|
|
pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
|
|
|
|
# Step 13: One-hot encode the 'DayOfWeek' column
|
|
pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
|
|
|
|
# Step 14: Convert 1-hour interval values to binary (True if > 0, else False)
|
|
for col in pivot_table.columns[1:25]: # Skip the 'date' column and focus on 1-hour intervals
|
|
pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
|
|
|
|
# Step 15: Add 'user' column with the specified user label
|
|
pivot_table['user'] = user_label
|
|
|
|
# Print which file is currently being processed
|
|
print(f"Processing file: {file_path}, User label: {user_label}")
|
|
|
|
# Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
|
|
pivot_table.drop(columns=['DayOfWeek'], inplace=True)
|
|
|
|
return pivot_table
|
|
|
|
# List of files to skip
|
|
files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
|
|
'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
|
|
'StepCount42.csv', 'StepCount46.csv'}
|
|
|
|
# Generate file paths, skipping specified files
|
|
file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
|
|
if f'StepCount{i:02d}.csv' not in files_to_skip]
|
|
|
|
# Generate user labels based on file index
|
|
user_labels = list(range(len(file_paths)))
|
|
|
|
# Process each file with its corresponding user label and concatenate the results
|
|
processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
|
|
combined_df = pd.concat(processed_dfs, ignore_index=True)
|
|
|
|
# Save the combined DataFrame to a new Excel file
|
|
updated_file_path = '/content/combined_aggregated_data_1hr_withthreshold.xlsx'
|
|
combined_df.to_excel(updated_file_path, index=False)
|
|
|
|
# Print the final DataFrame
|
|
print(combined_df)
|
|
|