You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
180 lines
6.3 KiB
180 lines
6.3 KiB
import os
|
|
import pandas as pd
|
|
|
|
|
|
def process_single_file(file_path, user_label, interval='1H', threshold=None):
    """
    Process a single step count CSV file into a pivoted daily activity DataFrame.

    Parameters
    ----------
    file_path : str
        Path to the CSV file (semicolon-delimited).
    user_label : int
        Unique label assigned to the user represented by this file.
    interval : str, optional
        Any valid pandas resampling interval (e.g., '1H', '15T', '30min', '5min').
    threshold : float or None, optional
        Minimum step count value to include in aggregation.
        If None, all values are included (no filtering).

    Returns
    -------
    pd.DataFrame
        A DataFrame where each row represents one day of activity with
        boolean indicators for each time interval, plus temporal and user info.
        An empty DataFrame is returned when the file has no usable rows.

    Raises
    ------
    ValueError
        If any of the required columns ('device', 'startDate', 'value')
        are missing from the file.
    """
    # Load dataset; the source exports use ';' as the field separator.
    df = pd.read_csv(file_path, delimiter=';')

    # Fail fast with a clear message if the schema is unexpected.
    required_cols = {'device', 'startDate', 'value'}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Missing required columns in {file_path}: {required_cols - set(df.columns)}")

    # Keep only iPhone-recorded rows; NaN device values count as no match.
    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()
    if iphone_df.empty:
        return pd.DataFrame()  # Skip empty or invalid files

    # Parse timestamps; unparseable values become NaT and are dropped.
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], errors='coerce')
    iphone_df.dropna(subset=['startDate'], inplace=True)

    # Floor each timestamp to the start of its interval bucket, then split
    # into date and time-of-day components for pivoting.
    iphone_df['interval_start'] = iphone_df['startDate'].dt.floor(interval)
    iphone_df['date'] = iphone_df['interval_start'].dt.date
    iphone_df['time'] = iphone_df['interval_start'].dt.time

    # Optional noise filter: keep only readings strictly above the threshold.
    if threshold is not None:
        iphone_df = iphone_df[iphone_df['value'] > threshold]
        if iphone_df.empty:
            return pd.DataFrame()  # Nothing survived filtering

    # Sum step counts within each (date, interval) bucket.
    interval_sum = (
        iphone_df.groupby(['date', 'time'])['value']
        .sum()
        .reset_index()
    )

    # Full set of interval start times covering one day, so every column
    # exists even when the data has gaps.
    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time

    # One row per date, one column per interval; missing buckets become 0.
    pivot_table = (
        interval_sum.pivot(index='date', columns='time', values='value')
        .fillna(0)
        .reindex(columns=full_time_range, fill_value=0)
    )

    # Stringify column labels and remember them explicitly; selecting by
    # name is safer than the previous positional slice.
    time_cols = [str(col) for col in pivot_table.columns]
    pivot_table.columns = time_cols
    pivot_table.reset_index(inplace=True)

    # Temporal features — parse the dates once and reuse the result
    # (the original converted the same column three times).
    dates = pd.to_datetime(pivot_table['date'])
    pivot_table['DayOfWeek'] = dates.dt.day_name()
    pivot_table['Month'] = dates.dt.month
    pivot_table['Year'] = dates.dt.year

    # One-hot encode day of week.
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')],
        axis=1
    )

    # Vectorized boolean conversion: interval active (steps > 0) or not.
    pivot_table[time_cols] = pivot_table[time_cols] > 0

    # Add user identifier.
    pivot_table['user'] = user_label

    # Drop original DayOfWeek (we have the one-hot encoded version).
    pivot_table.drop(columns=['DayOfWeek'], inplace=True)

    return pivot_table
|
def process_stepcount_files(input_folders, output_folder,
                            files_to_skip=None, interval='1H', threshold=None):
    """
    Process multiple step count CSV files from given folders into one aggregated Excel dataset.

    Parameters
    ----------
    input_folders : list of str
        List of folders to scan recursively for CSV files.
    output_folder : str
        Folder path where the combined Excel file will be saved.
    files_to_skip : set or list of str, optional
        Filenames to ignore during processing.
    interval : str, optional
        Any valid pandas resampling interval.
    threshold : float or None, optional
        Minimum value for step count inclusion. If None, all values are used.

    Returns
    -------
    pd.DataFrame
        Combined DataFrame containing all processed user data.

    Raises
    ------
    ValueError
        If no file yields any usable data.
    """
    # Ensure skip list is a set for fast lookup; tolerate None.
    files_to_skip = set(files_to_skip or [])

    # Collect all CSV file paths, recursively, from every input folder.
    file_paths = []
    for folder in input_folders:
        for root, _, files in os.walk(folder):
            for fname in files:
                if fname.endswith('.csv') and fname not in files_to_skip:
                    file_paths.append(os.path.join(root, fname))

    # os.walk order is filesystem-dependent; sort so user labels are
    # assigned deterministically across runs and machines.
    file_paths.sort()

    # Process each file; the positional index doubles as the user label.
    processed_dfs = []
    for user_label, file_path in enumerate(file_paths):
        df = process_single_file(file_path, user_label, interval, threshold)
        if not df.empty:
            processed_dfs.append(df)

    # Combine all processed data.
    if not processed_dfs:
        raise ValueError("No valid data files found for processing.")
    combined_df = pd.concat(processed_dfs, ignore_index=True)

    # Create output filename dynamically so it encodes the parameters used.
    threshold_label = (
        f"threshold{int(threshold)}" if threshold is not None else "nothreshold"
    )
    interval_label = interval.replace(' ', '').replace(':', '')
    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
    output_path = os.path.join(output_folder, output_filename)

    # Save to Excel, creating the output folder if needed.
    # NOTE(review): to_excel requires an Excel writer engine (e.g. openpyxl)
    # to be installed — confirm it is part of the deployment environment.
    os.makedirs(output_folder, exist_ok=True)
    combined_df.to_excel(output_path, index=False)

    return combined_df
# Example usage:
# combined_df = process_stepcount_files(
#     input_folders=['/path/to/data/folder'],
#     output_folder='/path/to/output/folder',
#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},
#     interval='30T',  # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.
#     threshold=25
# )

if __name__ == "__main__":
    # Guard the entry point so importing this module does not trigger a run.
    process_stepcount_files(
        ["Step_Data_Project_India/Rest_of_the_World", "Step_Data_Project_India/Europe"],
        "Step_Data_Project_India/OuptutIndiaTest",
        interval="1H",
    )