diff --git a/data_preprocessing.py b/data_preprocessing.py
index f765e0c..a0d5173 100644
--- a/data_preprocessing.py
+++ b/data_preprocessing.py
@@ -1,335 +1,180 @@
 import os
-
 import pandas as pd
 
-def process_file_one_hour_no_threshold(file_path, user_label):
-    # Load the dataset
+def process_single_file(file_path, user_label, interval='1H', threshold=None):
+    """
+    Process a single step count CSV file into a pivoted daily activity DataFrame.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the CSV file.
+    user_label : int
+        Unique label assigned to the user represented by this file.
+    interval : str, optional
+        Any fixed-frequency pandas offset alias (e.g., '1H', '15T', '30min', '5min').
+    threshold : float or None, optional
+        Records whose step count is at or below this value are excluded
+        before aggregation. If None, all records are included.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame where each row represents one day of activity, with
+        boolean indicators for each time interval plus temporal and user info.
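+
+    Examples
+    --------
+    Illustrative call only; the file name below is a placeholder, not a real file.
+
+    >>> daily = process_single_file('StepCount01.csv', user_label=0,
+    ...                             interval='15T', threshold=25)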
+    """
+
+    # Load the dataset (semicolon-delimited CSV)
     df = pd.read_csv(file_path, delimiter=';')
 
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Extract date and hour
-    iphone_df['date'] = iphone_df['startDate'].dt.date
-    iphone_df['hour'] = iphone_df['startDate'].dt.hour
+    # Ensure required columns exist
+    required_cols = {'device', 'startDate', 'value'}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Missing required columns in {file_path}: {required_cols - set(df.columns)}")
 
-    # Step 5: Group by date and hour, then sum the values
-    hourly_sum = iphone_df.groupby(['date', 'hour'])['value'].sum().reset_index()
-
-    # Step 6: Pivot the data to get one row per day with 24 columns for each hour
-    pivot_table = hourly_sum.pivot(index='date', columns='hour', values='value').fillna(0)
-
-    # Step 7: Rename columns to reflect hours
-    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
-
-    # Step 8: Reset index to have 'date' as a column instead of index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 9: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 10: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 11: Convert hourly values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on hours
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 12: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-    # Print which file is currently being processed
-    print(file_path,user_label)
-    # Step 13: Drop the 'DayOfWeek' column
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv','StepCount27.csv', 'StepCount31.csv','StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-def process_file_15_min_no_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]
+    # Filter for iPhone devices (treat NaN device names as non-matches)
+    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()
+    if iphone_df.empty:
+        return pd.DataFrame()  # Skip empty or invalid files
 
     # Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Extract date, time, year, and month for 15-minute intervals
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-    iphone_df['Year'] = iphone_df['15min_interval'].dt.year
-    iphone_df['Month'] = iphone_df['15min_interval'].dt.month
-
-    # Group by date, time, year, and month, then sum the values
-
-
-    interval_sum = iphone_df.groupby(['date', 'time', 'Year', 'Month'])['value'].sum().reset_index()
-
-    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index=['date', 'Year', 'Month'], columns='time', values='value').fillna(0)
-
-    # Reindex to include all possible 15-minute intervals
+    iphone_df['startDate'] = pd.to_datetime(
+        iphone_df['startDate'], errors='coerce'
+    )
+    iphone_df.dropna(subset=['startDate'], inplace=True)
+
+    # Round down to the nearest interval dynamically
+    iphone_df['interval_start'] = iphone_df['startDate'].dt.floor(interval)
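+    # e.g. with interval='15T' a reading at 09:37:12 lands in the 09:30:00
+    # bucket; with interval='1H' it lands in 09:00:00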
+
+    # Extract date and time components
+    iphone_df['date'] = iphone_df['interval_start'].dt.date
+    iphone_df['time'] = iphone_df['interval_start'].dt.time
+
+    # Apply threshold filtering if specified
+    if threshold is not None:
+        iphone_df = iphone_df[iphone_df['value'] > threshold]
+
+    # Group by date and time, summing step values within each interval
+    interval_sum = (
+        iphone_df.groupby(['date', 'time'])['value']
+        .sum()
+        .reset_index()
+    )
+
+    # Generate a full time range based on the chosen interval
+    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time
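+    # e.g. interval='1H' gives 24 slots (00:00:00 ... 23:00:00);
+    # interval='15T' gives 96 slots (00:00:00 ... 23:45:00)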
+
+    # Pivot to make one row per date, columns as time intervals
+    pivot_table = interval_sum.pivot(
+        index='date', columns='time', values='value'
+    ).fillna(0)
+
+    # Ensure all intervals exist even if missing in data
     pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
 
-    # Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
+    # Rename columns for clarity
+    pivot_table.columns = [str(col) for col in pivot_table.columns]
 
-    # Convert interval values to boolean (True if > 0, else False)
-    pivot_table = pivot_table.apply(lambda col: col != 0, axis=0)
-
-    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
+    # Reset index to make 'date' a column again
     pivot_table.reset_index(inplace=True)
 
-    # Add day of the week
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-
-    # One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Add a user column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_without_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-user_counts = combined_df['user'].value_counts()
-
-# Display the count of each user
-print(user_counts.sort_index())
-
-def process_file_15_min_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 15-minute intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Step 9: Reindex to include all possible 15-minute intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 14: Convert 15-minute interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:97]:  # Skip the 'date' column and focus on 15-minute intervals
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 15: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_with_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-def process_file_1_hour_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 1-hour interval
-    iphone_df['1hr_interval'] = iphone_df['startDate'].dt.floor('H')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['1hr_interval'].dt.date
-    iphone_df['time'] = iphone_df['1hr_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 1-hour intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 1-hour interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 1-hour intervals (00:00:00 to 23:00:00)
-    full_time_range = pd.date_range('00:00', '23:00', freq='H').time
-
-    # Step 9: Reindex to include all possible 1-hour intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 1-hour intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
+    # Add temporal features
     pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
     pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
     pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
 
-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
+    # One-hot encode day of week
+    pivot_table = pd.concat(
+        [pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')],
+        axis=1
+    )
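+    # e.g. rows gain indicator columns like DayOfWeek_Monday ... DayOfWeek_Sunday
+    # (only day names actually present in the data become columns)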
 
-    # Step 14: Convert 1-hour interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on 1-hour intervals
+    # Convert all time-interval columns to boolean (active or not)
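+    # columns[0] is 'date'; the next len(full_time_range) columns are the time slots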
+    for col in pivot_table.columns[1:1 + len(full_time_range)]:
         pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
 
-    # Step 15: Add 'user' column with the specified user label
+    # Add user identifier
     pivot_table['user'] = user_label
 
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
+    # Drop original DayOfWeek (we have the one-hot encoded version)
     pivot_table.drop(columns=['DayOfWeek'], inplace=True)
 
     return pivot_table
 
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_1hr_withthreshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
+
+def process_stepcount_files(input_folders, output_folder,
+                            files_to_skip=None, interval='1H', threshold=None):
+    """
+    Process multiple step count CSV files from the given folders into one
+    aggregated Excel dataset.
+
+    Parameters
+    ----------
+    input_folders : list of str
+        List of folders to scan recursively for CSV files.
+    output_folder : str
+        Folder path where the combined Excel file will be saved.
+    files_to_skip : set or list of str, optional
+        Filenames to ignore during processing.
+    interval : str, optional
+        Any fixed-frequency pandas offset alias (e.g., '1H', '15T').
+    threshold : float or None, optional
+        Records whose step count is at or below this value are excluded.
+        If None, all records are used.
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined DataFrame containing all processed user data.
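+
+    Examples
+    --------
+    Illustrative call only; the folder names below are placeholders.
+
+    >>> combined = process_stepcount_files(
+    ...     input_folders=['data/ios'],
+    ...     output_folder='results',
+    ...     files_to_skip={'StepCount06.csv'},
+    ...     interval='1H',
+    ...     threshold=None,
+    ... )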
+    """
+
+    # Ensure skip list is a set for fast lookup
+    files_to_skip = set(files_to_skip or [])
+
+    # Collect all CSV file paths
+    file_paths = []
+    for folder in input_folders:
+        for root, _, files in os.walk(folder):
+            for fname in files:
+                if fname.endswith('.csv') and fname not in files_to_skip:
+                    file_paths.append(os.path.join(root, fname))
+
+    # Assign user labels
+    user_labels = list(range(len(file_paths)))
+
+    # Process each file
+    processed_dfs = []
+    for file_path, user_label in zip(file_paths, user_labels):
+        df = process_single_file(file_path, user_label, interval, threshold)
+        if not df.empty:
+            processed_dfs.append(df)
+
+    # Combine all processed data
+    if not processed_dfs:
+        raise ValueError("No valid data files found for processing.")
+    combined_df = pd.concat(processed_dfs, ignore_index=True)
+
+    # Create output filename dynamically
+    threshold_label = (
+        f"threshold{int(threshold)}" if threshold is not None else "nothreshold"
+    )
+    interval_label = interval.replace(' ', '').replace(':', '')
+    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
+    output_path = os.path.join(output_folder, output_filename)
+
+    # Save to Excel
+    os.makedirs(output_folder, exist_ok=True)
+    combined_df.to_excel(output_path, index=False)
+
+    return combined_df
+
+
+# Example usage:
+# combined_df = process_stepcount_files(
+#     input_folders=['/path/to/data/folder'],
+#     output_folder='/path/to/output/folder',
+#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},
+#     interval='30T',  # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.
+#     threshold=25
+# )
+
+# Guard the ad-hoc test run so importing this module does not trigger it
+if __name__ == "__main__":
+    process_stepcount_files(
+        ["Step_Data_Project_India/Rest_of_the_World", "Step_Data_Project_India/Europe"],
+        "Step_Data_Project_India/OuptutIndiaTest",
+        interval="1H",
+    )
\ No newline at end of file
diff --git a/data_preprocessing_main.py b/data_preprocessing_main.py
new file mode 100644
index 0000000..cd909dc
--- /dev/null
+++ b/data_preprocessing_main.py
@@ -0,0 +1,36 @@
+import data_preprocessing
+
+# Example usage:
+# combined_df = data_preprocessing.process_stepcount_files(
+#     input_folders=[
+#         '/content/drive/My Drive/Data/iOS',
+#         '/content/drive/My Drive/Data/Watch'
+#     ],
+#     output_folder='/content/drive/My Drive/Data/Results',
+#     files_to_skip={
+#         'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv',
+#         'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
+#         'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv',
+#         'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
+#         'StepCount42.csv', 'StepCount46.csv'
+#     },
+#     interval='15T',  # or '1H'
+#     threshold=25     # or None
+# )
+
+input_folders = [
+    'Step_Data_Project_India/Europe/Europe',
+    'Step_Data_Project_India/Rest_of_the_World'
+]
+output_folder = 'Step_Data_Project_India/Preprocessing_Results'
+files_to_skip = {
+    'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv',
+    'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
+    'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv',
+    'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
+    'StepCount42.csv', 'StepCount46.csv'
+}
+interval = '15T'
+threshold = 25
+
+combined_df = data_preprocessing.process_stepcount_files(
+    input_folders, output_folder, files_to_skip, interval, threshold
+)
\ No newline at end of file