import os

import pandas as pd


def process_single_file(file_path, user_label, interval='1H', threshold=None):
    """
    Process a single step count CSV file into a pivoted daily activity DataFrame.

    Parameters
    ----------
    file_path : str
        Path to the semicolon-delimited CSV file.
    user_label : int
        Unique label assigned to the user represented by this file.
    interval : str, optional
        Any valid pandas resampling interval (e.g., '1H', '15T', '30min', '5min').
    threshold : float or None, optional
        Only rows whose step count is strictly greater than this value are
        aggregated. If None, all values are included (no filtering).

    Returns
    -------
    pd.DataFrame
        One row per day of activity with boolean indicators for each time
        interval, one-hot day-of-week columns, Month/Year columns, and the
        user label. Empty DataFrame if the file yields no usable iPhone rows.

    Raises
    ------
    ValueError
        If any of the required columns ('device', 'startDate', 'value')
        is missing from the file.
    """
    # Load dataset (files use ';' as the field separator).
    df = pd.read_csv(file_path, delimiter=';')

    # Ensure required columns exist before touching them.
    required_cols = {'device', 'startDate', 'value'}
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing required columns in {file_path}: {required_cols - set(df.columns)}"
        )

    # Keep only iPhone rows; NaN device entries are treated as non-matches.
    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()
    if iphone_df.empty:
        return pd.DataFrame()  # Skip empty or invalid files

    # Parse timestamps; unparseable rows are dropped rather than crashing.
    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], errors='coerce')
    iphone_df.dropna(subset=['startDate'], inplace=True)

    # Round down to the nearest interval dynamically, then split into
    # date and time-of-day components for pivoting.
    iphone_df['interval_start'] = iphone_df['startDate'].dt.floor(interval)
    iphone_df['date'] = iphone_df['interval_start'].dt.date
    iphone_df['time'] = iphone_df['interval_start'].dt.time

    # Apply threshold filtering if specified (strictly greater-than).
    if threshold is not None:
        iphone_df = iphone_df[iphone_df['value'] > threshold]

    # Group by date and time, summing step values within each interval.
    interval_sum = (
        iphone_df.groupby(['date', 'time'])['value']
        .sum()
        .reset_index()
    )
    # FIX: threshold filtering (or timestamp parsing) may have removed every
    # row; the original code would build a degenerate pivot from this.
    if interval_sum.empty:
        return pd.DataFrame()

    # Full set of interval start times for one day, so intervals with no
    # recorded data still appear as columns.
    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time

    # Pivot to one row per date, one column per time interval.
    pivot_table = interval_sum.pivot(
        index='date', columns='time', values='value'
    ).fillna(0)

    # Ensure all intervals exist even if missing in the data.
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

    # Stringify time columns for clarity, and turn 'date' back into a column.
    pivot_table.columns = [str(col) for col in pivot_table.columns]
    pivot_table.reset_index(inplace=True)

    # Temporal features — parse the date column once instead of three times.
    parsed_dates = pd.to_datetime(pivot_table['date'])
    pivot_table['DayOfWeek'] = parsed_dates.dt.day_name()
    pivot_table['Month'] = parsed_dates.dt.month
    pivot_table['Year'] = parsed_dates.dt.year

    # One-hot encode day of week.
    pivot_table = pd.concat(
        [pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')],
        axis=1
    )

    # Convert the time-interval columns (immediately after 'date') to booleans:
    # True means any steps were recorded in that interval.
    for col in pivot_table.columns[1:1 + len(full_time_range)]:
        pivot_table[col] = pivot_table[col] > 0  # vectorized; replaces apply/lambda

    # Add user identifier and drop the raw DayOfWeek (one-hot version kept).
    pivot_table['user'] = user_label
    pivot_table.drop(columns=['DayOfWeek'], inplace=True)

    return pivot_table


def process_stepcount_files(input_folders, output_folder, files_to_skip=None,
                            interval='1H', threshold=None):
    """
    Process multiple step count CSV files from given folders into one
    aggregated Excel dataset.

    Parameters
    ----------
    input_folders : list of str
        List of folders to scan recursively for CSV files.
    output_folder : str
        Folder path where the combined Excel file will be saved
        (created if it does not exist).
    files_to_skip : set or list of str, optional
        Filenames to ignore during processing.
    interval : str, optional
        Any valid pandas resampling interval.
    threshold : float or None, optional
        Minimum value for step count inclusion. If None, all values are used.

    Returns
    -------
    pd.DataFrame
        Combined DataFrame containing all processed user data.

    Raises
    ------
    ValueError
        If no file produced any usable data.
    """
    # Ensure skip list is a set for fast lookup.
    files_to_skip = set(files_to_skip or [])

    # Collect all CSV file paths.
    file_paths = []
    for folder in input_folders:
        for root, _, files in os.walk(folder):
            for fname in files:
                if fname.endswith('.csv') and fname not in files_to_skip:
                    file_paths.append(os.path.join(root, fname))

    # FIX: os.walk order is filesystem-dependent; sort so user labels are
    # reproducible across runs and machines.
    file_paths.sort()

    # Process each file; positional index doubles as the user label.
    processed_dfs = []
    for user_label, file_path in enumerate(file_paths):
        df = process_single_file(file_path, user_label, interval, threshold)
        if not df.empty:
            processed_dfs.append(df)

    if not processed_dfs:
        raise ValueError("No valid data files found for processing.")

    combined_df = pd.concat(processed_dfs, ignore_index=True)

    # Create output filename dynamically from the chosen parameters.
    threshold_label = (
        f"threshold{int(threshold)}" if threshold is not None else "nothreshold"
    )
    interval_label = interval.replace(' ', '').replace(':', '')
    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
    output_path = os.path.join(output_folder, output_filename)

    # Save to Excel (requires an Excel writer engine, e.g. openpyxl).
    os.makedirs(output_folder, exist_ok=True)
    combined_df.to_excel(output_path, index=False)

    return combined_df


# Example usage:
# combined_df = process_stepcount_files(
#     input_folders=['/path/to/data/folder'],
#     output_folder='/path/to/output/folder',
#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},
#     interval='30T',   # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.
#     threshold=25
# )

if __name__ == "__main__":
    # FIX: guard the script entry point so importing this module no longer
    # triggers a full processing run (and a crash if the paths are absent).
    process_stepcount_files(
        ["Step_Data_Project_India/Rest_of_the_World",
         "Step_Data_Project_India/Europe"],
        "Step_Data_Project_India/OuptutIndiaTest",
        interval="1H",
    )