Meine Versuche, das Preprocessing in normale .py-Dateien zu verwandeln. Funktioniert noch nicht.

3 months ago · b9fb1128c7
2 changed files with 188 additions and 307 deletions
--- a/data_preprocessing.py
+++ b/data_preprocessing.py
@ -1,335 +1,180 @@
 import os
-
 import pandas as pd

-def process_file_one_hour_no_threshold(file_path, user_label):

-    # Load the dataset
+def process_single_file(file_path, user_label, interval='1H', threshold=None):
+    """
+    Turn one step-count CSV export into a per-day activity table.
+
+    The file is read with ``;`` as delimiter, restricted to rows whose
+    ``device`` contains ``'iPhone'``, bucketed into ``interval``-sized slots
+    of the day and pivoted so that each calendar day becomes one row.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the CSV file.
+    user_label : int
+        Label written to the ``user`` column of every returned row.
+    interval : str, optional
+        Any valid pandas frequency string (e.g., '1H', '15T', '30min', '5min').
+    threshold : float or None, optional
+        Only samples whose ``value`` is strictly greater than this are summed.
+        If None, all samples are used (no filtering).
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per day: ``date``, one boolean column per time slot (True when
+        the summed value of the slot is > 0), ``Month``, ``Year``, seven
+        ``DayOfWeek_*`` indicator columns and ``user``. An empty DataFrame is
+        returned when no usable iPhone rows remain.
+
+    Raises
+    ------
+    ValueError
+        If one of the columns ``device``, ``startDate`` or ``value`` is missing.
+    """
+
+    # Load dataset (semicolon-separated export)
    df = pd.read_csv(file_path, delimiter=';')

-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Extract date and hour
-    iphone_df['date'] = iphone_df['startDate'].dt.date
-    iphone_df['hour'] = iphone_df['startDate'].dt.hour
+    # Ensure required columns exist
+    required_cols = {'device', 'startDate', 'value'}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Missing required columns in {file_path}: {required_cols - set(df.columns)}")

-    # Step 5: Group by date and hour, then sum the values
-    hourly_sum = iphone_df.groupby(['date', 'hour'])['value'].sum().reset_index()
-
-    # Step 6: Pivot the data to get one row per day with 24 columns for each hour
-    pivot_table = hourly_sum.pivot(index='date', columns='hour', values='value').fillna(0)
-
-    # Step 7: Rename columns to reflect hours
-    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
-
-    # Step 8: Reset index to have 'date' as a column instead of index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 9: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 10: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 11: Convert hourly values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on hours
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 12: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-      # Print which file is currently being processed
-    print(file_path,user_label)
-    # Step 13: Drop the 'DayOfWeek' column
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                  'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv','StepCount27.csv', 'StepCount31.csv','StepCount32.csv',
-                  'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-def process_file_15_min_no_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]
+    # Filter for iPhone devices. astype(str) keeps the .str accessor usable
+    # when the column is not string-typed (e.g. entirely NaN); the resulting
+    # 'nan' text never matches 'iPhone'.
+    iphone_df = df[df['device'].astype(str).str.contains('iPhone', na=False)].copy()
+    if iphone_df.empty:
+        return pd.DataFrame()  # Skip empty or invalid files

    # Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Extract date, time, year, and month for 15-minute intervals
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-    iphone_df['Year'] = iphone_df['15min_interval'].dt.year
-    iphone_df['Month'] = iphone_df['15min_interval'].dt.month
-
-    # Group by date, time, year, and month, then sum the values
-
-
-    interval_sum = iphone_df.groupby(['date', 'time', 'Year', 'Month'])['value'].sum().reset_index()
-
-    # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index=['date', 'Year', 'Month'], columns='time', values='value').fillna(0)
-
-    # Reindex to include all possible 15-minute intervals
+    iphone_df['startDate'] = pd.to_datetime(
+        iphone_df['startDate'], errors='coerce'
+    )
+    # Unparseable timestamps became NaT above and are discarded here
+    iphone_df.dropna(subset=['startDate'], inplace=True)
+
+    # Round down to the nearest interval dynamically
+    iphone_df['interval_start'] = iphone_df['startDate'].dt.floor(interval)
+
+    # Extract date and time components
+    iphone_df['date'] = iphone_df['interval_start'].dt.date
+    iphone_df['time'] = iphone_df['interval_start'].dt.time
+
+    # Apply threshold filtering if specified (per sample, before summing)
+    if threshold is not None:
+        iphone_df = iphone_df[iphone_df['value'] > threshold]
+
+    # Nothing left after date parsing / threshold filtering: report "no data"
+    # the same way as a file without iPhone rows.
+    if iphone_df.empty:
+        return pd.DataFrame()
+
+    # Group by date and time, summing step values within each interval
+    interval_sum = (
+        iphone_df.groupby(['date', 'time'])['value']
+        .sum()
+        .reset_index()
+    )
+
+    # Generate a full time range based on the chosen interval
+    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time
+
+    # Pivot to make one row per date, columns as time intervals
+    pivot_table = interval_sum.pivot(
+        index='date', columns='time', values='value'
+    ).fillna(0)
+
+    # Ensure all intervals exist even if missing in data
    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

-    # Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
+    # Rename columns for clarity
+    pivot_table.columns = [str(col) for col in pivot_table.columns]

-    # Convert interval values to boolean (True if > 0, else False)
-    pivot_table = pivot_table.apply(lambda col: col != 0, axis=0)
-
-    # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
+    # Reset index to make 'date' a column again
    pivot_table.reset_index(inplace=True)

-    # Add day of the week
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-
-    # One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Add a user column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                  'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                  'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_without_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-user_counts = combined_df['user'].value_counts()
-
-# Display the count of each user
-print(user_counts.sort_index())
-
-def process_file_15_min_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 15-minute interval
-    iphone_df['15min_interval'] = iphone_df['startDate'].dt.floor('15T')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['15min_interval'].dt.date
-    iphone_df['time'] = iphone_df['15min_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 15-minute intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 15-minute interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
-    full_time_range = pd.date_range('00:00', '23:45', freq='15T').time
-
-    # Step 9: Reindex to include all possible 15-minute intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 15-minute intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
-    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
-    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
-    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year
-
-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
-
-    # Step 14: Convert 15-minute interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:97]:  # Skip the 'date' column and focus on 15-minute intervals
-        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)
-
-    # Step 15: Add 'user' column with the specified user label
-    pivot_table['user'] = user_label
-
-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
-    pivot_table.drop(columns=['DayOfWeek'], inplace=True)
-
-    return pivot_table
-
-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                  'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                  'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_15min_with_threshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)
-
-
-def process_file_1_hour_with_threshold(file_path, user_label):
-    # Load the dataset
-    df = pd.read_csv(file_path, delimiter=';')
-
-    # Step 1: Filter for iPhone devices
-    iphone_df = df[df['device'].str.contains('iPhone', na=False)]  # Treat NaN as False
-
-    # Step 2: Select the desired columns
-    result = iphone_df[['startDate', 'endDate', 'value']]
-
-    # Step 3: Convert startDate to datetime
-    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
-
-    # Step 4: Round down the startDate to the nearest 1-hour interval
-    iphone_df['1hr_interval'] = iphone_df['startDate'].dt.floor('H')
-
-    # Step 5: Extract date and time
-    iphone_df['date'] = iphone_df['1hr_interval'].dt.date
-    iphone_df['time'] = iphone_df['1hr_interval'].dt.time
-
-    # Step 6: Group by date and time, then sum the values for 1-hour intervals
-    iphone_df_filtered = iphone_df[iphone_df['value'] > 25].dropna(subset=['value'])
-    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()
-
-    # Step 7: Pivot the data to get one row per day with columns for each 1-hour interval
-    pivot_table = interval_sum.pivot(index='date', columns='time', values='value').fillna(0)
-
-    # Step 8: Create a full range of 1-hour intervals (00:00:00 to 23:00:00)
-    full_time_range = pd.date_range('00:00', '23:00', freq='H').time
-
-    # Step 9: Reindex to include all possible 1-hour intervals and fill missing values with 0
-    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
-
-    # Step 10: Rename columns to reflect 1-hour intervals
-    pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
-
-    # Step 11: Reset index to have 'date' as a column instead of an index
-    pivot_table.reset_index(inplace=True)
-
-    # Step 12: Add day of the week, month, and year columns
+    # Add temporal features
    pivot_table['DayOfWeek'] = pd.to_datetime(pivot_table['date']).dt.day_name()
    pivot_table['Month'] = pd.to_datetime(pivot_table['date']).dt.month
    pivot_table['Year'] = pd.to_datetime(pivot_table['date']).dt.year

-    # Step 13: One-hot encode the 'DayOfWeek' column
-    pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table['DayOfWeek'], prefix='DayOfWeek')], axis=1)
+    # One-hot encode day of week. A fixed category list yields all seven
+    # DayOfWeek_* columns for every file, even when a weekday never occurs in
+    # it, so frames of different users line up when they are concatenated.
+    # Categories are listed alphabetically, the order get_dummies used before.
+    weekday_dtype = pd.CategoricalDtype([
+        'Friday', 'Monday', 'Saturday', 'Sunday',
+        'Thursday', 'Tuesday', 'Wednesday',
+    ])
+    weekday_dummies = pd.get_dummies(
+        pivot_table['DayOfWeek'].astype(weekday_dtype), prefix='DayOfWeek'
+    )
+    pivot_table = pd.concat([pivot_table, weekday_dummies], axis=1)

-    # Step 14: Convert 1-hour interval values to binary (True if > 0, else False)
-    for col in pivot_table.columns[1:25]:  # Skip the 'date' column and focus on 1-hour intervals
+    # Convert all time-interval columns to boolean (active or not);
+    # column 0 is 'date', the interval columns follow directly after it
+    for col in pivot_table.columns[1:1 + len(full_time_range)]:
        pivot_table[col] = pivot_table[col].apply(lambda x: True if x > 0 else False)

-    # Step 15: Add 'user' column with the specified user label
+    # Add user identifier
    pivot_table['user'] = user_label

-    # Print which file is currently being processed
-    print(f"Processing file: {file_path}, User label: {user_label}")
-
-    # Step 16: Drop the 'DayOfWeek' column as it has been one-hot encoded
+    # Drop original DayOfWeek (we have the one-hot encoded version)
    pivot_table.drop(columns=['DayOfWeek'], inplace=True)

    return pivot_table

-# List of files to skip
-files_to_skip = {'StepCount06.csv','StepCount10.csv','StepCount12.csv', 'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
-                  'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv', 'StepCount27.csv','StepCount31.csv','StepCount32.csv',
-                  'StepCount42.csv', 'StepCount46.csv'}
-
-# Generate file paths, skipping specified files
-file_paths = [f'/content/drive/My Drive/Data/iOS/StepCount{i:02d}.csv' for i in range(1, 47)
-              if f'StepCount{i:02d}.csv' not in files_to_skip]
-
-# Generate user labels based on file index
-user_labels = list(range(len(file_paths)))
-
-# Process each file with its corresponding user label and concatenate the results
-processed_dfs = [process_file(file_path, user_label) for file_path, user_label in zip(file_paths, user_labels)]
-combined_df = pd.concat(processed_dfs, ignore_index=True)
-
-# Save the combined DataFrame to a new Excel file
-updated_file_path = '/content/combined_aggregated_data_1hr_withthreshold.xlsx'
-combined_df.to_excel(updated_file_path, index=False)
-
-# Print the final DataFrame
-print(combined_df)

+def process_stepcount_files(input_folders, output_folder,
+                            files_to_skip=None, interval='1H', threshold=None):
+    """
+    Process all step count CSV files below the given folders into one Excel dataset.
+
+    Every ``*.csv`` file found (recursively) is handed to
+    ``process_single_file``; the non-empty results are concatenated and written
+    to ``combined_aggregated_data_<interval>_<threshold>.xlsx`` inside
+    ``output_folder``.
+
+    Parameters
+    ----------
+    input_folders : list of str, or str
+        Folders to scan recursively for CSV files. A single folder may be
+        given as a plain string.
+    output_folder : str
+        Folder path where the combined Excel file will be saved
+        (created if it does not exist).
+    files_to_skip : set or list of str, optional
+        Filenames (without directory) to ignore during processing.
+    interval : str, optional
+        Any valid pandas frequency string, forwarded to ``process_single_file``.
+    threshold : float or None, optional
+        Forwarded to ``process_single_file``. If None, all values are used.
+
+    Returns
+    -------
+    pd.DataFrame
+        Combined DataFrame containing all processed user data.
+
+    Raises
+    ------
+    ValueError
+        If no file produced any data.
+
+    Notes
+    -----
+    User labels are the positions of the files in sorted traversal order, so
+    they are reproducible across runs. Labels are assigned before empty
+    results are dropped, hence the labels in the output may have gaps.
+    """
+
+    # A bare string would otherwise be iterated character by character
+    if isinstance(input_folders, (str, os.PathLike)):
+        input_folders = [input_folders]
+
+    # Ensure skip list is a set for fast lookup
+    files_to_skip = set(files_to_skip or [])
+
+    # Collect all CSV file paths. os.walk yields entries in arbitrary,
+    # filesystem-dependent order; sorting keeps the file -> label mapping stable.
+    file_paths = []
+    for folder in input_folders:
+        for root, dirs, files in os.walk(folder):
+            dirs.sort()  # in-place sort steers the order of the remaining walk
+            for fname in sorted(files):
+                if fname.endswith('.csv') and fname not in files_to_skip:
+                    file_paths.append(os.path.join(root, fname))
+
+    # Process each file; its position in file_paths is its user label
+    processed_dfs = []
+    for user_label, file_path in enumerate(file_paths):
+        df = process_single_file(file_path, user_label, interval, threshold)
+        if not df.empty:
+            processed_dfs.append(df)
+
+    # Combine all processed data
+    if not processed_dfs:
+        raise ValueError("No valid data files found for processing.")
+    combined_df = pd.concat(processed_dfs, ignore_index=True)
+
+    # Create output filename dynamically. Whole-number thresholds keep the
+    # plain integer form (25 / 25.0 -> "threshold25"); fractional ones are
+    # written as-is instead of being truncated.
+    if threshold is None:
+        threshold_label = "nothreshold"
+    elif float(threshold).is_integer():
+        threshold_label = f"threshold{int(threshold)}"
+    else:
+        threshold_label = f"threshold{threshold}"
+    interval_label = interval.replace(' ', '').replace(':', '')
+    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
+    output_path = os.path.join(output_folder, output_filename)
+
+    # Save to Excel
+    os.makedirs(output_folder, exist_ok=True)
+    combined_df.to_excel(output_path, index=False)
+
+    return combined_df
+
+
+# Example usage:
+# combined_df = process_stepcount_files(
+#     input_folders=['/path/to/data/folder'],
+#     output_folder='/path/to/output/folder',
+#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},
+#     interval='30T',    # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.
+#     threshold=25
+# )
+
+if __name__ == "__main__":
+    # Run the pipeline only when this file is executed directly.
+    # data_preprocessing_main.py imports this module; without the guard the
+    # import itself would already start a full processing run.
+    process_stepcount_files(
+        ["Step_Data_Project_India/Rest_of_the_World", "Step_Data_Project_India/Europe"],
+        "Step_Data_Project_India/OuptutIndiaTest",
+        interval="1H",
+    )
--- a/data_preprocessing_main.py
+++ b/data_preprocessing_main.py
@ -0,0 +1,36 @@
+import data_preprocessing
+
+# Example usage:
+# combined_df = process_stepcount_files(
+#     input_folders=[
+#         '/content/drive/My Drive/Data/iOS',
+#         '/content/drive/My Drive/Data/Watch'
+#     ],
+#     output_folder='/content/drive/My Drive/Data/Results',
+#     files_to_skip={
+#         'StepCount06.csv', 'StepCount10.csv', 'StepCount12.csv',
+#         'StepCount13.csv', 'StepCount15.csv', 'StepCount17.csv',
+#         'StepCount18.csv', 'StepCount20.csv', 'StepCount24.csv',
+#         'StepCount27.csv', 'StepCount31.csv', 'StepCount32.csv',
+#         'StepCount42.csv', 'StepCount46.csv'
+#     },
+#     interval='15T',     # or '1H'
+#     threshold=25        # or None
+# )
+
+# Folders that are searched recursively for step-count CSV exports
+input_folders = [
+    'Step_Data_Project_India/Europe/Europe',
+    'Step_Data_Project_India/Rest_of_the_World',
+]
+
+# Destination folder of the combined Excel file
+output_folder = 'Step_Data_Project_India/Preprocessing_Results'
+
+# Exports excluded from processing, identified by their two-digit file number
+files_to_skip = {
+    f'StepCount{number:02d}.csv'
+    for number in (6, 10, 12, 13, 15, 17, 18, 20, 24, 27, 31, 32, 42, 46)
+}
+
+# Aggregation settings: 15-minute slots, only samples with value > 25
+interval = '15T'
+threshold = 25
+
+combined_df = data_preprocessing.process_stepcount_files(
+    input_folders=input_folders,
+    output_folder=output_folder,
+    files_to_skip=files_to_skip,
+    interval=interval,
+    threshold=threshold,
+)