My attempts at converting the preprocessing into regular .py files. Not working yet.
master
2 changed files with 188 additions and 307 deletions
data_preprocessing.py
@@ -1,335 +1,180 @@
 import os
 
 import pandas as pd
 
-def process_file_one_hour_no_threshold(file_path, user_label):
-    # Load the dataset
 
+def process_single_file(file_path, user_label, interval='1H', threshold=None):
+    """
+    Process a single step count CSV file into a pivoted daily activity DataFrame.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the CSV file.
+    user_label : int
+        Unique label assigned to the user represented by this file.
+    interval : str, optional
+        Any valid pandas resampling interval (e.g., '1H', '15T', '30min', '5min').
+    threshold : float or None, optional
+        Minimum step count value to include in aggregation.
+        If None, all values are included (no filtering).
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame where each row represents one day of activity with
+        boolean indicators for each time interval, plus temporal and user info.
+    """
+    # Load dataset with flexible column handling
     df = pd.read_csv(file_path, delimiter=';')
 
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Extract date and hour
-    iphone_df['date'] = iphone_df['startDate'].dt.date
-    iphone_df['hour'] = iphone_df['startDate'].dt.hour
+    # Ensure required columns exist
+    required_cols = {'device', 'startDate', 'value'}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Missing required columns in {file_path}: {required_cols - set(df.columns)}")
 
-    # Step 5: Group by date and hour, then sum the values
-    hourly_sum = iphone_df.groupby(['date', 'hour'])['value'].sum().reset_index()
-
-    # Step 6: Pivot the data to get one row per day with 24 columns for each hour
-    pivot_table = hourly_sum.pivot(index='date', columns='hour', values='value').fillna(0)
-
-    # Step 7: Rename columns to reflect hours
-    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
-
-    # Step 8: Reset index to have 'date' as a column instead of index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 9: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 10: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 11: Convert hourly values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on hours
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 12: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(file_path, user_label)
-
-    # Step 13: Drop the 'DayOfWeek' column
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-def process_file_15_min_no_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]
+    # Filter for iPhone devices (ignore NaN safely)
+    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()
+    if iphone_df.empty:
+        return pd.DataFrame()  # Skip empty or invalid files
 
     # Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Extract date, time, year, and month for 15-minute intervals
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-    iphone_df['Year'] = iphone_df['15min_interval'].dt.year
-    iphone_df['Month'] = iphone_df['15min_interval'].dt.month
-
-    # Group by date, time, year, and month, then sum the values
-    interval_sum = iphone_df.groupby(['date', 'time', 'Year', 'Month'])['value'].sum().reset_index()
-
-    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index=['date', 'Year', 'Month'], columns='time', values='value').fillna(0)
-
-    # Reindex to include all possible 15-minute intervals
+    iphone_df['startDate'] = pd.to_datetime(
+        iphone_df['startDate'], errors='coerce'
+    )
+    iphone_df.dropna(subset=['startDate'], inplace=True)
+
+    # Round down to the nearest interval dynamically
+    iphone_df['interval_start'] = iphone_df['startDate'].dt.floor(interval)
+
+    # Extract date and time components
+    iphone_df['date'] = iphone_df['interval_start'].dt.date
+    iphone_df['time'] = iphone_df['interval_start'].dt.time
+
+    # Apply threshold filtering if specified
+    if threshold is not None:
+        iphone_df = iphone_df[iphone_df['value'] > threshold]
+
+    # Group by date and time, summing step values within each interval
+    interval_sum = (
+        iphone_df.groupby(['date', 'time'])['value']
+        .sum()
+        .reset_index()
+    )
+
+    # Generate a full time range based on the chosen interval
+    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time
+
+    # Pivot to make one row per date, columns as time intervals
+    pivot_table = interval_sum.pivot(
+        index='date', columns='time', values='value'
+    ).fillna(0)
+
+    # Ensure all intervals exist even if missing in data
     pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
 
-    # Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
+    # Rename columns for clarity
+    pivot_table.columns = [str(col) for col in pivot_table.columns]
 
-    # Convert interval values to boolean (True if > 0, else False)
-    pivot_table = pivot_table.apply(lambda col: col != 0, axis=0)
-
-    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
+    # Reset index to make 'date' a column again
     pivot_table.reset_index(inplace=True)
 
-    # Add day of the week
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-
-    # One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Add a user column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    return pivot_table
-
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_without_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-user_counts = combined_df['user'].value_counts()
-
-# Display the count of each user
-print(user_counts.sort_index())
-
-
-def process_file_15_min_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 15-minute intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Step 9: Reindex to include all possible 15-minute intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 14: Convert 15-minute interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:97]:  # Skip the 'date' column and focus on 15-minute intervals
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 15: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_with_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-def process_file_1_hour_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 1-hour interval
-    iphone_df['1hr_interval'] = iphone_df['startDate'].dt.floor('H')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['1hr_interval'].dt.date
-    iphone_df['time'] = iphone_df['1hr_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 1-hour intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 1-hour interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 1-hour intervals (00:00:00 to 23:00:00)
-    full_time_range = pd.date_range('00:00', '23:00', freq='H').time
-
-    # Step 9: Reindex to include all possible 1-hour intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 1-hour intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
+    # Add temporal features
     pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
     pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
     pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
 
-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
+    # One-hot encode day of week
+    pivot_table = pd.concat(
+        [pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')],
+        axis=1
+    )
 
-    # Step 14: Convert 1-hour interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on 1-hour intervals
+    # Convert all time-interval columns to boolean (active or not)
+    for col in pivot_table.columns[1:1 + len(full_time_range)]:
         pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
 
-    # Step 15: Add 'user' column with the specified user label
+    # Add user identifier
     pivot_table['user'] = user_label
 
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
+    # Drop original DayOfWeek (we have the one-hot encoded version)
     pivot_table.drop(columns=['DayOfWeek'], inplace=True)
 
     return pivot_table
 
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                 'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
-                 'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_1hr_withthreshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
+
+
+def process_stepcount_files(input_folders, output_folder,
+                            files_to_skip=None, interval='1H', threshold=None):
+    """
+    Process multiple step count CSV files from given folders into one aggregated Excel dataset.
+
+    Parameters
+    ----------
+    input_folders : list of str
+        List of folders to scan recursively for CSV files.
+    output_folder : str
+        Folder path where the combined Excel file will be saved.
+    files_to_skip : set or list of str, optional
+        Filenames to ignore during processing.
+    interval : str, optional
+        Any valid pandas resampling interval.
+    threshold : float or None, optional
+        Minimum value for step count inclusion. If None, all values are used.
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined DataFrame containing all processed user data.
+    """
+    # Ensure skip list is a set for fast lookup
+    files_to_skip = set(files_to_skip or [])
+
+    # Collect all CSV file paths
+    file_paths = []
+    for folder in input_folders:
+        for root, _, files in os.walk(folder):
+            for fname in files:
+                if fname.endswith('.csv') and fname not in files_to_skip:
+                    file_paths.append(os.path.join(root, fname))
+
+    # Assign user labels
+    user_labels = list(range(len(file_paths)))
+
+    # Process each file
+    processed_dfs = []
+    for file_path, user_label in zip(file_paths, user_labels):
+        df = process_single_file(file_path, user_label, interval, threshold)
+        if not df.empty:
+            processed_dfs.append(df)
+
+    # Combine all processed data
+    if not processed_dfs:
+        raise ValueError("No valid data files found for processing.")
+    combined_df = pd.concat(processed_dfs, ignore_index=True)
+
+    # Create output filename dynamically
+    threshold_label = (
+        f"threshold{int(threshold)}" if threshold is not None else "nothreshold"
+    )
+    interval_label = interval.replace(' ', '').replace(':', '')
+    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
+    output_path = os.path.join(output_folder, output_filename)
+
+    # Save to Excel
+    os.makedirs(output_folder, exist_ok=True)
+    combined_df.to_excel(output_path, index=False)
+
+    return combined_df
+
+
+# Example usage:
+# combined_df = process_stepcount_files(
+#     input_folders=['/path/to/data/folder'],
+#     output_folder='/path/to/output/folder',
+#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},
+#     interval='30T',  # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.
+#     threshold=25
+# )
+
+process_stepcount_files(["Step_Data_Project_India/Rest_of_the_World", "Step_Data_Project_India/Europe"], "Step_Data_Project_India/OuptutIndiaTest", interval="1H")
@@ -0,0 +1,36 @@
+import data_preprocessing
+
+# Example usage:
+# combined_df = process_stepcount_files(
+#     input_folders=[
+#         '/content/drive/My Drive/Data/iOS',
+#         '/content/drive/My Drive/Data/Watch'
+#     ],
+#     output_folder='/content/drive/My Drive/Data/Results',
+#     files_to_skip={
+#         'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv',
+#         'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
+#         'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv',
+#         'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
+#         'StepCount42.csv', 'StepCount46.csv'
+#     },
+#     interval='15T',  # or '1H'
+#     threshold=25     # or None
+# )
+
+input_folders = [
+    'Step_Data_Project_India/Europe/Europe',
+    'Step_Data_Project_India/Rest_of_the_World'
+]
+output_folder = 'Step_Data_Project_India/Preprocessing_Results'
+files_to_skip = {
+    'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv',
+    'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
+    'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv',
+    'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
+    'StepCount42.csv', 'StepCount46.csv'
+}
+interval = '15T'
+threshold = 25
+
+combined_df = data_preprocessing.process_stepcount_files(input_folders, output_folder, files_to_skip, interval, threshold)
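For a quick sanity check of the refactored pipeline, something like the following smoke test should work. This is a sketch, not part of the commit: it assumes the first file above is saved as data_preprocessing.py, and the synthetic rows and the smoke_test.csv name are made up for illustration.

# Smoke test (illustrative): exercise process_single_file on a tiny synthetic export.
import pandas as pd

import data_preprocessing

rows = pd.DataFrame({
    'device': ['iPhone 12', 'iPhone 12', 'Apple Watch'],
    'startDate': ['2023-01-02 08:03:00 +0100',
                  '2023-01-02 08:20:00 +0100',
                  '2023-01-02 09:00:00 +0100'],
    'endDate': ['2023-01-02 08:10:00 +0100',
                '2023-01-02 08:25:00 +0100',
                '2023-01-02 09:05:00 +0100'],
    'value': [120, 80, 50],
})
rows.to_csv('smoke_test.csv', sep=';', index=False)  # same ';' delimiter the loader expects

# One day of iPhone data -> expect a single row: 96 boolean 15-minute columns
# plus date, Month, Year, one-hot day-of-week, and user columns.
daily = data_preprocessing.process_single_file('smoke_test.csv', user_label=0, interval='15T')
print(daily.shape)
# The two iPhone rows floor to the 08:00 and 08:15 slots; the Apple Watch row is filtered out.
print(daily[['08:00:00', '08:15:00', '09:00:00']])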