You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

141 lines
5.6 KiB

  1. import os
  2. import pandas as pd
  3. from main import month_str, year_str, time_str, date_str, day_of_week_str, user_str, dataset_min_path, dataset_hrs_path, \
  4. week_column_names
  5. def process_file_one_hour(file_path, user_label):
  6. # Load the dataset
  7. df = pd.read_csv(file_path, delimiter=';', low_memory=False)
  8. # Filter for iPhone devices
  9. iphone_df = df[df['device'].str.contains('iPhone', na=False)] # Treat NaN as False
  10. # Convert startDate to datetime
  11. iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
  12. # Extract date and hour
  13. hour_str = 'hour'
  14. iphone_df[hour_str] = iphone_df['startDate'].dt.hour
  15. iphone_df[date_str] = iphone_df['startDate'].dt.date
  16. iphone_df[year_str] = iphone_df['startDate'].dt.year
  17. iphone_df[month_str] = iphone_df['startDate'].dt.month
  18. # Group by date and hour, then sum the values
  19. hourly_sum = iphone_df.groupby([date_str, hour_str, year_str, month_str])['value'].sum().reset_index()
  20. # Pivot the data to get one row per day with 24 columns for each hour
  21. pivot_table = hourly_sum.pivot(index=[date_str, year_str, month_str],
  22. columns=hour_str, values='value').fillna(0)
  23. pivot_table = pivot_table.astype(int) # float because of the filled nas
  24. # Rename columns to reflect hours
  25. pivot_table.columns = [f'Hour_{i}' for i in pivot_table.columns]
  26. all_hours = ['Hour_'+ str(i) for i in range(24)]
  27. for hours in all_hours:
  28. if hours not in pivot_table.columns:
  29. pivot_table[hours] = 0
  30. # Reset index
  31. pivot_table.reset_index(inplace=True)
  32. # Add day of the week, month, and year columns
  33. pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
  34. # One-hot encode the 'DayOfWeek' column
  35. pivot_table = pd.concat([pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
  36. for week_day_col in week_column_names:
  37. if week_day_col not in pivot_table.columns:
  38. pivot_table[week_day_col] = 0
  39. # Add 'user' column with the specified user label
  40. pivot_table[user_str] = user_label
  41. # Step 13: Drop the 'DayOfWeek' column
  42. pivot_table.drop(columns=[day_of_week_str], inplace=True)
  43. return pivot_table
  44. def process_file_15_min(file_path, user_label):
  45. interval_str = '15min_interval'
  46. # Load the dataset
  47. df = pd.read_csv(file_path, delimiter=';', low_memory=False)
  48. # TODO: evtl. nicht nur iPhone date nutzen
  49. # Filter for iPhone devices
  50. iphone_df = df[df['device'].str.contains('iPhone', na=False)]
  51. # Convert startDate to datetime
  52. iphone_df['startDate'] = pd.to_datetime(iphone_df['startDate'], format='%Y-%m-%d %H:%M:%S %z')
  53. # Round down the startDate to the nearest 15-minute interval
  54. iphone_df[interval_str] = iphone_df['startDate'].dt.floor('15min')
  55. # Extract date, time, year, and month for 15-minute intervals
  56. iphone_df[date_str] = iphone_df[interval_str].dt.date
  57. iphone_df[time_str] = iphone_df[interval_str].dt.time
  58. iphone_df[year_str] = iphone_df[interval_str].dt.year
  59. iphone_df[month_str] = iphone_df[interval_str].dt.month
  60. # Group by date, time, year, and month, then sum the values
  61. interval_sum = iphone_df.groupby([date_str, time_str, year_str, month_str])['value'].sum().reset_index()
  62. # Create a full range of 15-minute intervals (00:00:00 to 23:45:00)
  63. full_time_range = pd.date_range('00:00', '23:45', freq='15min').time
  64. # Pivot the data to get one row per day with columns for each 15-minute interval
  65. pivot_table = interval_sum.pivot(index=[date_str, year_str, month_str], columns=time_str,
  66. values='value').fillna(0)
  67. pivot_table = pivot_table.astype(int) # float because of the filled nas
  68. # Reindex to include all possible 15-minute intervals
  69. pivot_table = pivot_table.reindex(columns=full_time_range, fill_value=0)
  70. # Rename columns to reflect 15-minute intervals
  71. pivot_table.columns = [f'{str(col)}' for col in pivot_table.columns]
  72. # Reset index to have 'date', 'Year', and 'Month' as columns instead of index
  73. pivot_table.reset_index(inplace=True)
  74. # Add day of the week
  75. pivot_table[day_of_week_str] = pd.to_datetime(pivot_table[date_str]).dt.day_name()
  76. # One-hot encode the 'DayOfWeek' column
  77. pivot_table = pd.concat(
  78. [pivot_table, pd.get_dummies(pivot_table[day_of_week_str], prefix=day_of_week_str, dtype=int)], axis=1)
  79. for week_day_col in week_column_names:
  80. if week_day_col not in pivot_table.columns:
  81. pivot_table[week_day_col] = 0
  82. # Add a user column with the specified user label
  83. pivot_table[user_str] = user_label
  84. pivot_table.drop(columns=[day_of_week_str], inplace=True)
  85. return pivot_table
  86. if __name__ == "__main__":
  87. pd.options.mode.copy_on_write = True
  88. # Generate file paths
  89. files = (['Europe/Europe/'+file for file in os.listdir('Europe/Europe/')]
  90. + ['Rest_of_the_World/'+file for file in os.listdir('Rest_of_the_World')])
  91. # Generate user labels based on file index
  92. user_labels = list(range(len(files)))
  93. for save_name, process_func in [(dataset_hrs_path, process_file_one_hour),
  94. (dataset_min_path, process_file_15_min)]:
  95. # Process each file with its corresponding user label and concatenate the results
  96. processed_dfs = [process_func(file_path, user_label) for file_path, user_label in zip(files, user_labels)]
  97. combined_df = pd.concat(processed_dfs, ignore_index=True)
  98. # Save the combined DataFrame to a new json file
  99. combined_df.to_json(save_name, index=False)
  100. print('Done')