Step_Data_Project_India/preprocessing_new.py


								import os


								import pandas as pd


								from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \

								    week_column_names


								def process_file_one_hour(file_path, user_label):
								    """Build a per-day table of hourly 'value' sums for one input file.

								    Reads a semicolon-delimited CSV that has 'device', 'startDate' and
								    'value' columns, keeps the rows whose 'device' contains 'iPhone' and
								    sums 'value' per calendar date and hour of 'startDate'.

								    Args:
								        file_path: Path of the CSV file to read.
								        user_label: Value written to the `user_str` column of every row.

								    Returns:
								        A DataFrame with one row per date containing the `date_str`,
								        `year_str` and `month_str` columns, the integer columns 'Hour_0'
								        to 'Hour_23' in that order (0 where no row fell into the hour),
								        the one-hot day-of-week columns and the `user_str` column.
								    """
								    hour_str = 'hour'

								    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

								    # Rows with a missing device (NaN) count as non-iPhone. The explicit
								    # copy makes the column assignments below write to an independent
								    # frame rather than to a slice of `df`, so the function behaves the
								    # same whether or not the caller enabled pandas' copy-on-write mode.
								    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()

								    # NOTE(review): the format requires a UTC offset in every value; a file
								    # mixing several offsets may not parse to one datetime dtype - confirm
								    # against the real data.
								    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

								    # Derive the grouping keys from the parsed timestamp
								    iphone_df[hour_str] = iphone_df['startDate'].dt.hour
								    iphone_df[date_str] = iphone_df['startDate'].dt.date
								    iphone_df[year_str] = iphone_df['startDate'].dt.year
								    iphone_df[month_str] = iphone_df['startDate'].dt.month

								    # Sum the values per date and hour (year/month are carried along)
								    hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()

								    # One row per day, one column per hour that occurs in the data
								    pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
								                                   columns=hour_str, values='value').fillna(0)

								    # The pivot is float because of the filled NaNs; store whole numbers
								    pivot_table = pivot_table.astype(int)

								    # Guarantee all 24 hour columns in ascending order; hours that never
								    # occur in this file are filled with 0.
								    pivot_table = pivot_table.reindex(columns=range(24), fill_value=0)
								    pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]

								    # Turn the date/year/month index levels back into ordinary columns
								    pivot_table = pivot_table.reset_index()

								    # One-hot encode the weekday name of each date
								    day_names = pd.to_datetime(pivot_table[date_str]).dt.day_name()
								    day_dummies = pd.get_dummies(day_names, prefix=day_of_week_str, dtype=int)
								    pivot_table = pd.concat([pivot_table, day_dummies], axis=1)

								    # Add any column listed in week_column_names that the encoding did
								    # not produce (presumably weekdays absent from this file - the list
								    # is defined in `main`).
								    for week_day_col in week_column_names:
								        if week_day_col not in pivot_table.columns:
								            pivot_table[week_day_col] = 0

								    # Tag every row with the user this file belongs to
								    pivot_table[user_str] = user_label

								    return pivot_table


								def process_file_15_min(file_path, user_label):
								    """Build a per-day table of 15-minute 'value' sums for one input file.

								    Reads a semicolon-delimited CSV that has 'device', 'startDate' and
								    'value' columns, keeps the rows whose 'device' contains 'iPhone',
								    floors 'startDate' to 15-minute intervals and sums 'value' per
								    calendar date and interval.

								    Args:
								        file_path: Path of the CSV file to read.
								        user_label: Value written to the `user_str` column of every row.

								    Returns:
								        A DataFrame with one row per date containing the `date_str`,
								        `year_str` and `month_str` columns, one integer column per
								        15-minute interval named '00:00:00' to '23:45:00' (0 where no row
								        fell into the interval), the one-hot day-of-week columns and the
								        `user_str` column.
								    """
								    interval_str = '15min_interval'

								    df = pd.read_csv(file_path, delimiter=';', low_memory=False)

								    # TODO: possibly use data from more than just iPhone devices
								    # Rows with a missing device (NaN) count as non-iPhone. The explicit
								    # copy makes the column assignments below write to an independent
								    # frame rather than to a slice of `df`, so the function behaves the
								    # same whether or not the caller enabled pandas' copy-on-write mode.
								    iphone_df = df[df['device'].str.contains('iPhone', na=False)].copy()

								    iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')

								    # Round each timestamp down to the start of its 15-minute interval
								    iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')

								    # Derive the grouping keys from the floored timestamp
								    iphone_df[date_str] = iphone_df[interval_str].dt.date
								    iphone_df[time_str] = iphone_df[interval_str].dt.time
								    iphone_df[year_str] = iphone_df[interval_str].dt.year
								    iphone_df[month_str] = iphone_df[interval_str].dt.month

								    # Sum the values per date and interval (year/month are carried along)
								    interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()

								    # Every interval start of a day: 00:00:00, 00:15:00, ..., 23:45:00
								    full_time_range = pd.date_range('00:00', '23:45', freq='15min').time

								    # One row per day, one column per interval that occurs in the data
								    pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
								                                     values='value').fillna(0)

								    # The pivot is float because of the filled NaNs; store whole numbers
								    pivot_table = pivot_table.astype(int)

								    # Guarantee a column for every interval of the day, in order
								    pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)

								    # Use the 'HH:MM:SS' text of each interval as its column name
								    pivot_table.columns = [str(col) for col in pivot_table.columns]

								    # Turn the date/year/month index levels back into ordinary columns
								    pivot_table = pivot_table.reset_index()

								    # One-hot encode the weekday name of each date
								    day_names = pd.to_datetime(pivot_table[date_str]).dt.day_name()
								    day_dummies = pd.get_dummies(day_names, prefix=day_of_week_str, dtype=int)
								    pivot_table = pd.concat([pivot_table, day_dummies], axis=1)

								    # Add any column listed in week_column_names that the encoding did
								    # not produce (presumably weekdays absent from this file - the list
								    # is defined in `main`).
								    for week_day_col in week_column_names:
								        if week_day_col not in pivot_table.columns:
								            pivot_table[week_day_col] = 0

								    # Tag every row with the user this file belongs to
								    pivot_table[user_str] = user_label

								    return pivot_table


								if __name__ == "__main__":
								    # Script entry point: process every input file once per resolution
								    # (hourly and 15-minute) and write one combined JSON file for each.
								    pd.options.mode.copy_on_write = True

								    # Collect the input files of both directories. os.listdir returns
								    # entries in arbitrary order, so each listing is sorted to keep the
								    # file -> user label mapping reproducible between runs and machines.
								    files = (['Europe/Europe/' + file for file in sorted(os.listdir('Europe/Europe/'))]
								             + ['Rest_of_the_World/' + file for file in sorted(os.listdir('Rest_of_the_World'))])

								    for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
								                                    (dataset_min_path, process_file_15_min)]:
								        # The position of a file in `files` serves as its user label
								        processed_dfs = [process_func(file_path, user_label)
								                         for user_label, file_path in enumerate(files)]

								        combined_df = pd.concat(processed_dfs, ignore_index=True)

								        # Save the combined DataFrame as JSON. `index=False` was dropped:
								        # it is only meaningful for specific `orient` values, and this
								        # call uses the default orient.
								        combined_df.to_json(save_name)

								    print('Done')