# Step_Data_Project_India/data_preprocessing.py


								import os

								import pandas as pd


								def process_single_file(file_path, user_label, interval='1H', threshold=None):
								    """
								    Process a single step count CSV file into a pivoted daily activity DataFrame.

								    Only rows whose ``device`` contains 'iPhone' are used. Each record is
								    assigned to the interval its ``startDate`` falls in, values are summed
								    per (date, interval), and every interval is reduced to an active flag
								    (sum greater than zero).

								    Parameters
								    ----------
								    file_path : str
								        Path to the semicolon-delimited CSV file. It must contain the
								        columns 'device', 'startDate' and 'value'.
								    user_label : int
								        Unique label assigned to the user represented by this file.
								    interval : str, optional
								        Any valid pandas frequency string (e.g., '1H', '15T', '30min', '5min').
								    threshold : float or None, optional
								        Individual records whose value is not strictly greater than this
								        are dropped before aggregation. If None, no filtering is applied.

								    Returns
								    -------
								    pd.DataFrame
								        One row per day with a 'date' column, one boolean column per time
								        interval (named after the interval start, e.g. '08:00:00'),
								        'Month', 'Year', seven 'DayOfWeek_<name>' indicator columns and
								        'user'. An empty DataFrame is returned when no usable iPhone rows
								        remain after filtering.

								    Raises
								    ------
								    ValueError
								        If any of the required columns is missing from the file.
								    """
								    df = pd.read_csv(file_path, delimiter=';')

								    # Ensure required columns exist
								    required_cols = {'device', 'startDate', 'value'}
								    if not required_cols.issubset(df.columns):
								        raise ValueError(f"Missing required columns in {file_path}: {required_cols - set(df.columns)}")

								    # Keep iPhone rows only. Casting to str first keeps the .str accessor
								    # usable when the device column holds no strings at all (e.g. all NaN).
								    is_iphone = df['device'].astype(str).str.contains('iPhone', na=False)
								    iphone_df = df[is_iphone].copy()
								    if iphone_df.empty:
								        return pd.DataFrame()  # Skip empty or invalid files

								    # Unparseable timestamps become NaT and are discarded
								    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], errors='coerce')
								    iphone_df = iphone_df.dropna(subset=['startDate'])

								    # Apply threshold filtering if specified
								    if threshold is not None:
								        iphone_df = iphone_df[iphone_df['value'] > threshold]

								    # Date parsing or the threshold may have removed every row
								    if iphone_df.empty:
								        return pd.DataFrame()

								    # Round each timestamp down to the start of its interval
								    interval_start = iphone_df['startDate'].dt.floor(interval)
								    iphone_df = iphone_df.assign(
								        date=interval_start.dt.date,
								        time=interval_start.dt.time,
								    )

								    # Sum step values within each (date, interval) cell
								    interval_sum = iphone_df.groupby(['date', 'time'])['value'].sum().reset_index()

								    # Every interval start within one day, so all files share the same columns
								    full_time_range = pd.date_range('00:00', '23:59', freq=interval).time

								    # One row per date, one column per interval; cells without data compare
								    # as False, so the result is directly the active/inactive flag.
								    activity = interval_sum.pivot(index='date', columns='time', values='value')
								    pivot_table = activity.reindex(columns=full_time_range, fill_value=0) > 0
								    pivot_table.columns = [str(col) for col in pivot_table.columns]
								    pivot_table = pivot_table.reset_index()

								    # Temporal features, derived from a single parse of the date column
								    dates = pd.to_datetime(pivot_table['date'])
								    pivot_table['Month'] = dates.dt.month
								    pivot_table['Year'] = dates.dt.year

								    # One-hot encode the weekday against a fixed category list so every file
								    # yields the same seven columns, even when it covers only a few days.
								    # Alphabetical order matches what get_dummies produces for plain strings.
								    all_days = ['Friday', 'Monday', 'Saturday', 'Sunday',
								                'Thursday', 'Tuesday', 'Wednesday']
								    weekday = dates.dt.day_name().astype(pd.CategoricalDtype(categories=all_days))
								    pivot_table = pd.concat(
								        [pivot_table, pd.get_dummies(weekday, prefix='DayOfWeek')],
								        axis=1
								    )

								    # Add user identifier
								    pivot_table['user'] = user_label

								    return pivot_table


								def process_stepcount_files(input_folders, output_folder,
								                            files_to_skip=None, interval='1H', threshold=None):
								    """
								    Process multiple step count CSV files from given folders into one aggregated Excel dataset.

								    Every '.csv' file found under the input folders is passed to
								    ``process_single_file``; the non-empty results are concatenated and
								    written to ``combined_aggregated_data_<interval>_<threshold>.xlsx``
								    inside ``output_folder``.

								    Parameters
								    ----------
								    input_folders : list of str
								        List of folders to scan recursively for CSV files. A single path
								        is also accepted and treated as a one-element list.
								    output_folder : str
								        Folder path where the combined Excel file will be saved. It is
								        created if it does not exist.
								    files_to_skip : set or list of str, optional
								        Filenames (not full paths) to ignore during processing.
								    interval : str, optional
								        Any valid pandas frequency string.
								    threshold : float or None, optional
								        Minimum value for step count inclusion. If None, all values are used.

								    Returns
								    -------
								    pd.DataFrame
								        Combined DataFrame containing all processed user data.

								    Raises
								    ------
								    ValueError
								        If no file produced any data.
								    """
								    # A bare path would otherwise be iterated character by character
								    if isinstance(input_folders, (str, os.PathLike)):
								        input_folders = [input_folders]

								    # Ensure skip list is a set for fast lookup
								    files_to_skip = set(files_to_skip or [])

								    # Collect all CSV file paths. os.walk yields entries in arbitrary
								    # order, so sort directories (in place, which steers the walk) and
								    # files to keep the user labels reproducible between runs.
								    file_paths = []
								    for folder in input_folders:
								        for root, dirs, files in os.walk(folder):
								            dirs.sort()
								            file_paths.extend(
								                os.path.join(root, fname)
								                for fname in sorted(files)
								                if fname.endswith('.csv') and fname not in files_to_skip
								            )

								    # The user label is the file's position in the discovered list, so
								    # labels can have gaps when a file yields no data.
								    processed_dfs = []
								    for user_label, file_path in enumerate(file_paths):
								        df = process_single_file(file_path, user_label, interval, threshold)
								        if not df.empty:
								            processed_dfs.append(df)

								    # Combine all processed data
								    if not processed_dfs:
								        raise ValueError("No valid data files found for processing.")
								    combined_df = pd.concat(processed_dfs, ignore_index=True)

								    # Build the output filename. Whole-number thresholds keep the plain
								    # integer form; fractional ones keep their full value so that, for
								    # example, 0.5 and 0 do not end up writing to the same file.
								    if threshold is None:
								        threshold_label = "nothreshold"
								    elif float(threshold).is_integer():
								        threshold_label = f"threshold{int(threshold)}"
								    else:
								        threshold_label = f"threshold{threshold}"
								    interval_label = interval.replace(' ', '').replace(':', '')
								    output_filename = f"combined_aggregated_data_{interval_label}_{threshold_label}.xlsx"
								    output_path = os.path.join(output_folder, output_filename)

								    # Save to Excel
								    os.makedirs(output_folder, exist_ok=True)
								    combined_df.to_excel(output_path, index=False)

								    return combined_df


								# Example usage:

								# combined_df = process_stepcount_files(

								#     input_folders=['/path/to/data/folder'],

								#     output_folder='/path/to/output/folder',

								#     files_to_skip={'StepCount06.csv', 'StepCount10.csv'},

								#     interval='30T',    # Any valid pandas frequency, e.g. '5T', '10T', '2H', etc.

								#     threshold=25

								# )


								# Build the hourly, unthresholded dataset from both regional folders.
								# NOTE: this call is not guarded, so it also runs when the module is imported.
								process_stepcount_files(
								    input_folders=[
								        "Step_Data_Project_India/Rest_of_the_World",
								        "Step_Data_Project_India/Europe",
								    ],
								    output_folder="Step_Data_Project_India/OuptutIndiaTest",
								    interval="1H",
								)