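# LSTM user-identification pipeline: builds per-user sliding-window sequences
# from 15-minute interval data, tunes a Bidirectional LSTM with keras-tuner
# RandomSearch, and writes per-user test accuracy to an Excel workbook.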
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import ExcelWriter
import shutil

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras_tuner import RandomSearch
from sklearn.metrics import accuracy_score
# === Clean previous tuning directory ===
shutil.rmtree("./working/tuner", ignore_errors=True)

# === Load dataset ===
file_path = './Datasets/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'
df = pd.read_excel(file_path)
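# Expected layout (inferred from the code below, not verified against the file):
# one row per user per 15-minute interval, with a 'user' label column, calendar
# columns 'Year', 'Month', 'date', 'DayOfWeek', and the remaining columns as features.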
# === Helper functions for scenario selection ===
def get_user_input_for_scenario(scenario_type):
    print(f"\nPlease define your custom {scenario_type} scenario:")
    years_input = input(f"Enter {scenario_type} years (comma-separated, e.g., 2017,2018): ").strip()
    years = list(map(int, years_input.split(',')))
    years_months = []
    for year in years:
        months_input = input(f"Enter months for year {year} (comma-separated, e.g., 1,2,3): ").strip()
        months = list(map(int, months_input.split(',')))
        years_months.append((year, months))
    return years_months
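# Example: entering years "2018,2019", then months "1,2,3" for 2018 and "10,11"
# for 2019, returns [(2018, [1, 2, 3]), (2019, [10, 11])].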
def display_warning_about_2020_data():
    print("\n⚠️ Warning: 2020 data after February is excluded due to COVID-19.")
    print("✅ Only Jan and Feb 2020 are used for testing. Do not use them in training/validation.")
def display_warnings_for_scenarios(scenario_type):
    if scenario_type == "training":
        print("\n⚠️ Predefined Training Scenarios (for reference only):")
        for name, scenario in predefined_training_scenarios.items():
            parts = [f"{year}-{months}" for year, months in scenario['years_months']]
            print(f"  {name}: {', '.join(parts)}")
    elif scenario_type == "validation":
        print("\n⚠️ Predefined Validation Scenario:")
        for name, scenario in predefined_validation_scenarios.items():
            parts = [f"{year}-{months}" for year, months in scenario['years_months']]
            print(f"  {name}: {', '.join(parts)}")
        print("  - This uses Oct, Nov, Dec of 2019")
predefined_training_scenarios = {
    "Scenario 1": {"years_months": [(2018, list(range(1, 13))), (2019, list(range(1, 10)))]},
    "Scenario 2": {"years_months": [(2017, list(range(1, 13))), (2018, list(range(1, 13))), (2019, list(range(1, 10)))]}
}

predefined_validation_scenarios = {
    "Scenario A": {"years_months": [(2019, [10, 11, 12])]}
}
# === Filter and preprocess data ===
def filter_data(df, scenario):
    # Collect the matching slices first, then concatenate once (avoids the
    # growing-DataFrame antipattern of concatenating inside the loop).
    parts = [df[(df['Year'] == year) & (df['Month'].isin(months))] for year, months in scenario]
    filtered = pd.concat(parts)
    return filtered.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'])
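# Example (illustrative): filter_data(df, [(2019, [10, 11, 12])]) keeps the rows
# with Year == 2019 and Month in {10, 11, 12}, then drops the calendar columns so
# only the feature columns and the 'user' label remain.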
# === Get test scenario input ===
def get_user_input_for_test():
    print("\n=== Testing Scenario Setup ===")
    print("⚠️ Only January and February of 2020 were used for testing in the predefined setup.")
    print("⚠️ Avoid using 2020 data after February due to COVID-19 impact.\n")
    years_input = input("Enter test years (comma-separated, e.g., 2020): ").strip()
    years = list(map(int, years_input.split(',')))
    years_months = []
    for year in years:
        months_input = input(f"Enter months for year {year} (comma-separated, e.g., 1,2): ").strip()
        months = list(map(int, months_input.split(',')))
        years_months.append((year, months))
    return years_months
def filter_test_data(df, scenario):
    data_parts = []
    for year, months in scenario:
        part = df[(df['Year'] == year) & (df['Month'].isin(months))]
        data_parts.append(part)
    return pd.concat(data_parts, ignore_index=True)
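# Note: unlike filter_data(), this keeps the calendar columns; they are dropped
# inside evaluate_model_on_test_data() instead.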
def evaluate_model_on_test_data(model, test_df, sequence_length, excel_writer):
    print("\n🧪 Evaluating on Test Data...")
    test_df = test_df.drop(columns=['Month', 'Year', 'date', 'DayOfWeek'])
    # Stable sort keeps the chronological row order within each user intact
    # (pandas' default quicksort does not guarantee this).
    test_df = test_df.sort_values(by='user', kind='stable').reset_index(drop=True)

    users = test_df['user'].unique()
    results = []
    accuracy_above_50 = 0

    for user in users:
        user_df = test_df[test_df['user'] == user]
        X, y_true = [], []
        user_features = user_df.drop(columns=['user']).values
        user_labels = user_df['user'].values

        if len(user_df) <= sequence_length:
            print(f"Skipping User {user} (not enough data for sequence length {sequence_length})")
            continue

        # Sliding window: each sample is `sequence_length` consecutive rows,
        # labelled with the user id of the row that follows the window.
        for i in range(len(user_df) - sequence_length):
            seq_x = user_features[i:i + sequence_length]
            seq_y = user_labels[i + sequence_length]
            X.append(seq_x)
            y_true.append(seq_y)

        X = np.array(X)
        y_true = np.array(y_true)

        if len(X) == 0:
            continue

        y_pred = model.predict(X, verbose=0)
        y_pred_classes = np.argmax(y_pred, axis=1)

        unique_pred, counts_pred = np.unique(y_pred_classes, return_counts=True)
        label_counts_pred = dict(zip(unique_pred, counts_pred))

        unique_true, counts_true = np.unique(y_true, return_counts=True)
        label_counts_true = dict(zip(unique_true, counts_true))

        acc = accuracy_score(y_true, y_pred_classes)
        if acc > 0.5:
            accuracy_above_50 += 1

        # Append result to list
        results.append({
            'User': user,
            'Accuracy (%)': acc * 100,
            'Predicted Class Distribution': str(label_counts_pred),
            'Actual Class Distribution': str(label_counts_true)
        })

        print(f"\n=== User {user} ===")
        print(f"✅ Accuracy: {acc * 100:.2f}%")
        print("📊 Predicted Class Distribution:", label_counts_pred)
        print("📌 Actual Class Distribution:  ", label_counts_true)

    final_accuracy_percent = (accuracy_above_50 / 32) * 100  # 32 users in this dataset
    print(f"\n🟩 Final Evaluation Summary for Sequence Length {sequence_length}:")
    print(f"Users with >50% Accuracy: {accuracy_above_50} / 32")
    print(f"✅ Final Success Rate: {final_accuracy_percent:.2f}%")

    # Append overall stats as a new row
    results.append({
        'User': 'TOTAL',
        'Accuracy (%)': '',
        'Predicted Class Distribution': f'Users >50% Acc: {accuracy_above_50}/32',
        'Actual Class Distribution': f'Success Rate: {final_accuracy_percent:.2f}%'
    })

    # Save results to an Excel sheet, one sheet per sequence length
    df_results = pd.DataFrame(results)
    df_results.to_excel(excel_writer, sheet_name=f"SeqLen_{sequence_length}", index=False)
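# Shape check (illustrative): for a user with T test rows and window length L,
# the loop above yields T - L samples, so X.shape == (T - L, L, n_features) and
# y_true.shape == (T - L,).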
# === Get user-defined training and validation scenarios ===
print("=== Training Scenario Setup ===")
display_warning_about_2020_data()
display_warnings_for_scenarios("training")
training_scenario = get_user_input_for_scenario("training")

print("\n=== Validation Scenario Setup ===")
display_warning_about_2020_data()
display_warnings_for_scenarios("validation")
validation_scenario = get_user_input_for_scenario("validation")

data = filter_data(df, training_scenario)
data_val = filter_data(df, validation_scenario)
# === Organize by user ===
# Stable sort so each user's rows keep their original (chronological) order,
# which the sliding-window sequence construction below depends on.
df_sorted = data.sort_values(by='user', kind='stable').reset_index(drop=True)
df_sorted_val = data_val.sort_values(by='user', kind='stable').reset_index(drop=True)
users = df_sorted['user'].unique()
users_val = df_sorted_val['user'].unique()

user_data = {user: df_sorted[df_sorted['user'] == user] for user in users}
user_data_val = {user: df_sorted_val[df_sorted_val['user'] == user] for user in users_val}
# === Callbacks ===
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)
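# EarlyStopping halts training once val_loss has not improved for 5 epochs and
# rolls back to the best weights seen; ReduceLROnPlateau halves the learning
# rate after 5 stagnant epochs.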
# === Model tuning and training loop ===
best_models = {}

for sequence_length in range(20, 30, 5):  # sequence lengths 20 and 25
    print(f"\n=== Training for Sequence Length: {sequence_length} ===")

    # Training data
    X, y = [], []
    for user, user_df in user_data.items():
        features = user_df.drop('user', axis=1).values
        labels = user_df['user'].values
        for i in range(len(features) - sequence_length):
            X.append(features[i:i + sequence_length])
            y.append(labels[i + sequence_length])
    X = np.array(X)
    y = np.array(y)

    # Validation data
    X_val, y_val = [], []
    for user, user_df in user_data_val.items():
        features = user_df.drop('user', axis=1).values
        labels = user_df['user'].values
        for i in range(len(features) - sequence_length):
            X_val.append(features[i:i + sequence_length])
            y_val.append(labels[i + sequence_length])
    X_val = np.array(X_val)
    y_val = np.array(y_val)

    if X.shape[0] == 0 or X_val.shape[0] == 0:
        print(f"⚠️ Skipped sequence length {sequence_length} due to insufficient data.")
        continue

    n_features = X.shape[2]

    def build_model(hp):
        model = Sequential()
        model.add(Bidirectional(LSTM(units=hp.Int('units', 32, 256, step=2),
                                     input_shape=(sequence_length, n_features))))
        model.add(Dropout(hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
        # Sparse categorical crossentropy assumes user ids are 0..len(users)-1.
        model.add(Dense(len(users), activation='softmax'))
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=30,
        executions_per_trial=2,
        directory='./working/tuner',
        project_name=f'lstm_seq_{sequence_length}'
    )

    tuner.search(X, y, epochs=30, validation_data=(X_val, y_val),
                 callbacks=[early_stopping, lr_scheduler], verbose=1)

    # Rebuild the best configuration and retrain it from scratch
    best_hps = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hps)
    best_model.fit(X, y, epochs=30, validation_data=(X_val, y_val),
                   callbacks=[early_stopping, lr_scheduler], verbose=0)

    best_models[sequence_length] = {
        'model': best_model,
        'best_hyperparameters': {
            'units': best_hps.get('units'),
            'dropout_rate': best_hps.get('dropout_rate'),
            'learning_rate': best_hps.get('learning_rate')
        }
    }
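# RandomSearch samples up to 30 hyperparameter combinations; with
# executions_per_trial=2 each combination is trained twice and val_loss is
# averaged across the runs before the best configuration is rebuilt above.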
# === Run evaluation for each trained sequence length ===
test_scenario = get_user_input_for_test()
test_data = filter_test_data(df, test_scenario)

output_excel_path = "./working/evaluation_results.xlsx"

with ExcelWriter(output_excel_path) as writer:
    for sequence_length, result in best_models.items():
        print(f"\n🔍 Testing Model for Sequence Length: {sequence_length}")
        evaluate_model_on_test_data(
            result['model'],
            test_data.copy(),
            sequence_length,
            writer  # shared writer, so each run adds its own sheet
        )

print(f"\n✅ All evaluations completed. Results saved to: {output_excel_path}")
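# To inspect a sheet later (illustrative):
#   pd.read_excel("./working/evaluation_results.xlsx", sheet_name="SeqLen_20")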