#!/usr/bin/env python
# coding: utf-8

# # ML Experimentation Pipeline

# 🎯 Goal: Final ML pipeline: feature engineering, single-model baselines, feature selection, hyperparameter tuning, and an ensemble for submission.

# ## SETUP

# In[ ]:


# Import libraries
import numpy as np
import pandas as pd
from tqdm import tqdm

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(13,6)})


# In[ ]:


import plotly.express as px


# In[ ]:


# Set notebook mode to work offline
import plotly.offline as pyo

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)


# In[ ]:


import sklearn
sklearn.__version__


# Scikit-learn version used: 1.3.0

# In[ ]:


from sklearn import set_config
set_config(transform_output = "pandas")
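

# A minimal sketch of what `transform_output="pandas"` changes: transformers now return DataFrames with column names instead of bare NumPy arrays.

# In[ ]:


from sklearn.preprocessing import StandardScaler

demo_df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
StandardScaler().fit_transform(demo_df)  # returns a DataFrame, not an ndarray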


# In[ ]:


# Import Machine Learning libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from sklearn.ensemble import VotingRegressor


# In[ ]:


import lightgbm as lgb
import xgboost as xgb 


# In[ ]:


import joblib


# # Feature Engineering (TRAIN)

# In[ ]:


# Load data
full_df = pd.read_csv("../data/input/train.csv")


# In[ ]:


# Check shape
full_df.shape


# In[ ]:


# Airline Encoding
full_df['Airline_enc'] = full_df['Airline'].map({'Airline A': 1, 'Airline B': 2, 'Airline C': 3})
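

# Note: `.map` returns NaN for any category missing from the mapping; the gradient-boosting models used below (HistGradientBoosting, LightGBM, XGBoost) handle NaN natively, so no imputation step is added.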


# In[ ]:


# Extract info from "Arrival_City", "Departure_City"
prefixes = ["New", "Port", "Lake", "South", "East", "West", "North"]


def extract_prefixes(city):
    # Use pd.isna for missing values: identity checks against NaN are
    # unreliable, and np.NAN was removed in NumPy 2.0
    if pd.isna(city):
        return np.nan
    ls = [prefix for prefix in prefixes if prefix in city]
    return "-".join(ls)
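

# Quick sanity check of `extract_prefixes` on hypothetical city names (not from the dataset). Matching is by substring, and multi-prefix results such as "New-Port" are not keys in the encoding maps below, so they encode as NaN (which the models used here tolerate).

# In[ ]:


print(extract_prefixes("New Port City"))  # "New-Port"
print(extract_prefixes("Lakeside"))       # "Lake"
print(extract_prefixes("Smithtown"))      # "" (no matching prefix)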


# In[ ]:


# Arrival City
full_df["Arrival_City_prefix"] = full_df["Arrival_City"].apply(extract_prefixes)


# In[ ]:


# Arrival City Encoding
full_df["Arrival_City_prefix_enc"] = (full_df["Arrival_City_prefix"]
                                      .map({'':0,
                                            'New': 1, 
                                            'Port': 2, 
                                            'Lake': 3,
                                            'West':4,
                                            'East':5,
                                            'South':6,
                                            'North':7
                                           })
                                     )


# In[ ]:


# Departure City
full_df["Departure_City_prefix"] = full_df["Departure_City"].apply(extract_prefixes)


# In[ ]:


# Encoding
full_df["Departure_City_prefix_enc"] = (full_df["Departure_City_prefix"]
                                      .map({'':0,
                                            'New': 1, 
                                            'Port': 2, 
                                            'Lake': 3,
                                            'West':4,
                                            'East':5,
                                            'South':6,
                                            'North':7
                                           })
                                     )


# In[ ]:


# Arrival Time Hour
full_df["Arrival_Time_Hour"] = full_df["Arrival_Time"].apply(lambda x: x.split(":")[0]).astype(int)

# Arrival Time Number
full_df["Arrival_Time_num"] = full_df["Arrival_Time"].str.replace(":","").astype(int)

# Day or night flag (1 = arrival before 12:00)
full_df["Arrival_Day_Night"] = (full_df["Arrival_Time_num"] < 1_200).astype(int)


# In[ ]:


# Departure Time Hour
full_df["Departure_Time_Hour"] = full_df["Departure_Time"].apply(lambda x: x.split(":")[0]).astype(int)

# Departure Time Number
full_df["Departure_Time_num"] = full_df["Departure_Time"].str.replace(":","").astype(int)

# Day or night flag (1 = departure before 12:00)
full_df["Departure_Day_Night"] = (full_df["Departure_Time_num"] < 1_200).astype(int)


# In[ ]:


# Aircraft Type
full_df['Aircraft_Type_enc'] = (full_df['Aircraft_Type']
                                .map({'Boeing 787':1, 'Airbus A320':2, 'Boeing 737':3, 
                                      'Boeing 777':4, 'Airbus A380':5})
                               )


# In[ ]:


# Day of Week
full_df['Day_of_Week_enc'] = (full_df['Day_of_Week']
                              .map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                                    'Friday': 5, 'Saturday': 6, 'Sunday': 7})
                             )


# In[ ]:


# Weekend
full_df['Weekend'] = (full_df['Day_of_Week_enc'] >= 6).astype(int)


# In[ ]:


# Month of Travel
full_df['Month_of_Travel_enc'] = (full_df['Month_of_Travel']
                                  .map({'January': 1, 'February': 2, 'March': 3, 
                                        'April': 4, 'May':5, 'June':6,
                                        'July': 7, 'August': 8, 'September': 9,
                                        'October':10, 'November':11, 'December':12})
                                 )

# Month of Travel Holiday
full_df['Month_of_Travel_Holiday_enc'] = (full_df['Month_of_Travel']
                                          .map({'January': 2, 'February': 3, 'March': 1, 
                                                'April': 2, 'May':3, 'June':1,
                                                'July': 2, 'August': 3, 'September': 1,
                                                'October':2, 'November':3, 'December':1})
                                         )


# In[ ]:


# Holiday
full_df['Holiday_Season_enc'] = (full_df['Holiday_Season']
                                  .map({'Summer': 1, 'Spring': 2, 'Fall': 3, 
                                        'Winter': 4, 'None':5})
                                 )


# In[ ]:


# Demand
full_df['Demand_enc'] = (full_df['Demand']
                                  .map({'Low': 1, 'Medium': 2, 'High': 3})
                                 )


# In[ ]:


#  Weather Conditions
full_df['Weather_Conditions_enc'] = (full_df['Weather_Conditions']
                                  .map({'Clear': 1, 'Cloudy': 2, 'Rain': 3, 'Snow':4})
                                 )


# In[ ]:


# Promotion Type
full_df['Promotion_Type_enc'] = (full_df['Promotion_Type']
                                  .map({'None': 1, 'Discount': 2, 'Special Offer': 3})
                                 )


# # Keep only relevant columns

# In[ ]:


# Check all columns
full_df.columns


# In[ ]:


# Extract only numerical columns
COLUMNS_TO_KEEP = [
    "Airline_enc",
    "Arrival_City_prefix_enc",
    "Departure_City_prefix_enc",
    "Arrival_Time_Hour",
    "Arrival_Time_num",
    "Arrival_Day_Night",
    "Departure_Time_Hour",
    "Departure_Time_num",
    "Departure_Day_Night",
    "Aircraft_Type_enc",
    "Day_of_Week_enc",
    "Weekend",
    "Distance",
    "Duration",
    "Number_of_Stops",
    "Passenger_Count",
    "Fuel_Price",
    "Demand_enc",
    "Weather_Conditions_enc",
    "Promotion_Type_enc",
    "Month_of_Travel_enc",
    "Holiday_Season_enc",
    "Month_of_Travel_Holiday_enc",
    "Flight_Price",
]


# In[ ]:


# Columns dropped
print("Columns dropped")
print(set(full_df.columns).difference(COLUMNS_TO_KEEP))


# In[ ]:


# Keep only relevant columns
full_df = full_df[COLUMNS_TO_KEEP].copy()


# In[ ]:


# Check shape
full_df.shape


# In[ ]:


# Check dataset
full_df


# In[ ]:


# Check datatypes
full_df.dtypes


# In[ ]:


# Check NAs
full_df.isna().sum().to_frame(name="Total missing values")


# In[ ]:


# Column names
full_df.columns


# In[ ]:


# Extract Features from the data

FEATURES = [
    'Airline_enc', 'Arrival_City_prefix_enc', 'Departure_City_prefix_enc',
    'Arrival_Time_Hour', 'Arrival_Time_num', 'Arrival_Day_Night',
    'Departure_Time_Hour', 'Departure_Time_num', 'Departure_Day_Night',
    'Aircraft_Type_enc', 'Day_of_Week_enc', 'Weekend', 'Distance',
    'Duration', 'Number_of_Stops', 'Passenger_Count', 'Fuel_Price',
    'Demand_enc', 'Weather_Conditions_enc', 'Promotion_Type_enc',
    'Month_of_Travel_enc', 'Holiday_Season_enc',
    'Month_of_Travel_Holiday_enc'
]

print("Total Features: ", len(FEATURES))
print("Features: ", FEATURES)

print("\n")

TARGET = 'Flight_Price'
print("Target: ", TARGET)


# # Define Cross-Validation

# In[ ]:


# Define Cross-Validation object
cv_strategy = KFold(n_splits=3, shuffle=True, random_state=42)
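

# Note on the metric: `neg_root_mean_squared_error` is RMSE with the sign flipped so that "greater is better" holds, i.e. RMSE = -score. A tiny worked example with hypothetical values:

# In[ ]:


y_true_demo = np.array([100.0, 200.0])
y_pred_demo = np.array([110.0, 190.0])
rmse_demo = mean_squared_error(y_true_demo, y_pred_demo, squared=False)  # 10.0
print(rmse_demo, -rmse_demo)  # 10.0 -10.0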


# # 1️⃣ SINGLE MODEL EXPERIMENTS

# ## HistGradientBoostingRegressor

# In[ ]:


# Define Model
model = HistGradientBoostingRegressor(random_state=42)
model


# In[ ]:


# Run Cross-Validation
cv_scores = cross_validate(
    estimator = model, 
    X = full_df[FEATURES],
    y = full_df[TARGET],
    cv=cv_strategy, 
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=3,
    n_jobs=3,
)


# In[ ]:


# Cross Validation Scores
cv_scores_df = pd.DataFrame(cv_scores)
cv_scores_df


# In[ ]:


# Average Score
train_cv_score = cv_scores_df["train_score"].mean().round(2)
test_cv_score = cv_scores_df["test_score"].mean().round(2)

print("Train CV Score:", train_cv_score)
print("Test CV Score:", test_cv_score)


# **BASE HIST SCORES:**
#     
# * Train CV Score: -12.84
# * Test CV Score: -14.2

# ## LightGBM

# In[ ]:


# Define Model
model = lgb.LGBMRegressor(random_state=42) 
model


# In[ ]:


# Run Cross-Validation
cv_scores = cross_validate(
    estimator = model, 
    X = full_df[FEATURES],  # full feature set; lgb_features_list is only defined later, in feature selection
    y = full_df[TARGET],
    cv=cv_strategy, 
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=3
)


# In[ ]:


# Cross Validation Scores
cv_scores_df = pd.DataFrame(cv_scores)
cv_scores_df


# In[ ]:


# Average Score
train_cv_score = cv_scores_df["train_score"].mean().round(2)
test_cv_score = cv_scores_df["test_score"].mean().round(2)

print("Train CV Score:", train_cv_score)
print("Test CV Score:", test_cv_score)


# **BASE LGB SCORES:**
#     
# * Train CV Score: -12.59
# * Test CV Score: -13.97

# ## XGBoost

# In[ ]:


# Define Model
model = xgb.XGBRegressor(random_state=42)
model


# In[ ]:


# Run Cross-Validation
cv_scores = cross_validate(
    estimator = model, 
    X = full_df[FEATURES],
    y = full_df[TARGET],
    cv=cv_strategy, 
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=3
)


# In[ ]:


# Cross Validation Scores
cv_scores_df = pd.DataFrame(cv_scores)
cv_scores_df


# In[ ]:


# Average Score
train_cv_score = cv_scores_df["train_score"].mean().round(2)
test_cv_score = cv_scores_df["test_score"].mean().round(2)

print("Train CV Score:", train_cv_score)
print("Test CV Score:", test_cv_score)


# **BASE XGBOOST SCORES:**
#     
# * Train CV Score: -11.27
# * Test CV Score: -15.84

# # 2️⃣ FEATURE SELECTION

# ## HIST Permutation Importance

# In[ ]:


# Define the model
model = HistGradientBoostingRegressor(random_state=42)

# Fit the model
model.fit(X = full_df[FEATURES], y = full_df[TARGET])

# Perform permutation importance (default n_repeats=5 shuffles per feature;
# note this uses MSE rather than the RMSE used elsewhere)
results = permutation_importance(
    estimator = model, 
    X = full_df[FEATURES], 
    y = full_df[TARGET],
    scoring='neg_mean_squared_error'
)

# Mean importance of each feature across the permutation repeats
importance = results.importances_mean


# In[ ]:


# Create a dataframe with permutation importances
feature_importances_df = pd.DataFrame(data = {"feature_name": FEATURES, "permutation_importance":importance})

feature_importances_df = (feature_importances_df
                          .sort_values("permutation_importance", ascending=False)
                          .reset_index(drop=True)
                         )

feature_importances_df


# In[ ]:


# Custom forward feature selection based on the permutation importances

# Get a feature list sorted by permutation importance
features_sorted_ls = list(feature_importances_df["feature_name"].values)

scores_df = pd.DataFrame()

for cutoff in tqdm(range(1, len(features_sorted_ls)+1)):
        
    # Extract training features
    feature_set = features_sorted_ls[:cutoff]
    print("Total features:", len(feature_set))
    print("Features:", feature_set)

    # Train and evaluate a baseline HistGradientBoostingRegressor model using CV
    model = HistGradientBoostingRegressor(random_state=42)
    
    cv_scores = cross_validate(estimator=model,
                               X=full_df[feature_set],
                               y=full_df[TARGET],
                               cv=cv_strategy,
                               scoring='neg_root_mean_squared_error',
                               return_train_score=True,
                               n_jobs=-1,
                               verbose=0)
    
    train_round_score = cv_scores["train_score"].mean()
    print("TRAIN SCORE:", train_round_score)
    
    test_round_score = cv_scores["test_score"].mean()
    print("TEST SCORE:", test_round_score)
    
    
    round_df = pd.DataFrame(data = {"total_features": len(feature_set),
                                    "features":[feature_set],
                                    "train_score": train_round_score,
                                    "test_score": test_round_score})
    
    scores_df = pd.concat([scores_df, round_df])
    scores_df = scores_df.reset_index(drop=True)
    
    print("-"*100)


# In[ ]:


# Check Scores
scores_df


# In[ ]:


# Extract the best score
scores_df.iloc[scores_df["test_score"].idxmax()]


# **FEATURE SELECTION HIST SCORES:**
# 
# * Train CV Score: -12.83
# * Test CV Score: -14.12

# In[ ]:


# Best Features
scores_df.iloc[scores_df["test_score"].idxmax()]["features"]


# In[ ]:


# Plot Scores
fig = px.line(data_frame=scores_df,
                x="total_features",
                y="test_score",
                markers=True,
                title="Average CV score per subset of features | HistGradientBoostingRegressor")

fig.add_scatter(x=scores_df["total_features"],
                y=scores_df["train_score"], 
                mode="lines+markers",
                name="Train CV Score"
               )

fig.show()


# In[ ]:


# Store feature selection dataset
# scores_df.to_csv("feature_selection_HistGradientBoostingRegressor_permutation_importances_scores.csv", index=False)


# In[ ]:


# Best features for HIST
hist_features_list = [
    'Distance',
    'Demand_enc',
    'Fuel_Price',
    'Duration',
    'Number_of_Stops',
    'Month_of_Travel_enc',
    'Day_of_Week_enc',
    'Weather_Conditions_enc',
    'Holiday_Season_enc',
    'Airline_enc']


# ## LightGBM

# In[ ]:


# Step 1: Train a simple LightGBM regression model on the full dataset
model = lgb.LGBMRegressor(random_state=42)
model.fit(X = full_df[FEATURES], y = full_df[TARGET])


# In[ ]:


# Step 2: Extract Feature Importances
feature_importances_df = pd.DataFrame(data = {'feature_name': model.feature_name_,
                                              'importance': model.feature_importances_})

feature_importances_df = feature_importances_df.sort_values("importance", ascending=False)

print("Feature Importances:")
display(feature_importances_df)


# In[ ]:


print('Plot feature importances...')
ax = lgb.plot_importance(model, max_num_features=24)
plt.show()


# In[ ]:


# Step 3: Custom forward feature selection based on the feature importances

# Get a feature list sorted based on feature importance
features_sorted_ls = list(feature_importances_df["feature_name"].values)

scores_df = pd.DataFrame()

for cutoff in tqdm(range(1, len(features_sorted_ls)+1)):
        
    # Extract training features
    feature_set = features_sorted_ls[:cutoff]
    print("Total features:", len(feature_set))
    print("Features:", feature_set)

    # Train and evaluate a baseline LightGBM model using cv
    model = lgb.LGBMRegressor(random_state=42)
    
    cv_scores = cross_validate(estimator=model,
                               X=full_df[feature_set],
                               y=full_df[TARGET],
                               cv=cv_strategy,
                               scoring='neg_root_mean_squared_error',
                               return_train_score=True,
                               n_jobs=-1,
                               verbose=0)
    
    train_round_score = cv_scores["train_score"].mean()
    print("TRAIN SCORE:", train_round_score)
    
    test_round_score = cv_scores["test_score"].mean()
    print("TEST SCORE:", test_round_score)
    
    
    round_df = pd.DataFrame(data = {"total_features": len(feature_set),
                                    "features":[feature_set],
                                    "train_score": train_round_score,
                                    "test_score": test_round_score})
    
    scores_df = pd.concat([scores_df, round_df])
    scores_df = scores_df.reset_index(drop=True)
    
    print("-"*100)


# In[ ]:


# Check Scores
scores_df


# In[ ]:


# Extract the best score
scores_df.iloc[scores_df["test_score"].idxmax()]


# **FEATURE SELECTION LGB SCORES:**
#     
# * Train CV Score: -12.59
# * Test CV Score: -13.92

# In[ ]:


# Best Features
scores_df.iloc[scores_df["test_score"].idxmax()]["features"]


# In[ ]:


# Plot Scores
fig = px.line(data_frame=scores_df,
                x="total_features",
                y="test_score",
                markers=True,
                title="Average CV score per subset of features | LGBMRegressor")

fig.add_scatter(x=scores_df["total_features"],
                y=scores_df["train_score"], 
                mode="lines+markers",
                name="Train CV Score"
               )

fig.show()


# In[ ]:


# Store feature selection dataset
# scores_df.to_csv("feature_selection_LightGBM_Regressor_importances_scores.csv", index=False)


# In[ ]:


# Best features for LGB
lgb_features_list = [
    'Fuel_Price',
    'Distance',
    'Month_of_Travel_enc',
    'Demand_enc',
    'Number_of_Stops',
    'Duration',
    'Day_of_Week_enc',
    'Weather_Conditions_enc',
    'Arrival_Time_num',
    'Holiday_Season_enc']


# ## XGBoost

# In[ ]:


# Step 1: Train a simple XGBoost regression model on the full dataset
model = xgb.XGBRegressor(random_state=42)
model.fit(X = full_df[FEATURES], y = full_df[TARGET])


# In[ ]:


# Step 2: Extract Feature Importances
feature_importances_df = pd.DataFrame(data = {'feature_name': model.feature_names_in_,
                                              'importance': model.feature_importances_})

feature_importances_df = feature_importances_df.sort_values("importance", ascending=False)

print("Feature Importances:")
display(feature_importances_df)


# In[ ]:


print('Plot feature importances...')
ax = xgb.plot_importance(model, max_num_features=24)
plt.show()


# In[ ]:


# Step 3: Custom forward feature selection based on the feature importances

# Get a feature list sorted based on feature importance
features_sorted_ls = list(feature_importances_df["feature_name"].values)

scores_df = pd.DataFrame()

for cutoff in tqdm(range(1, len(features_sorted_ls)+1)):
        
    # Extract training features
    feature_set = features_sorted_ls[:cutoff]
    print("Total features:", len(feature_set))
    print("Features:", feature_set)

    # Train and evaluate a baseline XGBoost model using CV
    model = xgb.XGBRegressor(random_state=42)
    
    cv_scores = cross_validate(estimator=model,
                               X=full_df[feature_set],
                               y=full_df[TARGET],
                               cv=cv_strategy,
                               scoring='neg_root_mean_squared_error',
                               return_train_score=True,
                               n_jobs=-1,
                               verbose=0)
    
    train_round_score = cv_scores["train_score"].mean()
    print("TRAIN SCORE:", train_round_score)
    
    test_round_score = cv_scores["test_score"].mean()
    print("TEST SCORE:", test_round_score)
    
    
    round_df = pd.DataFrame(data = {"total_features": len(feature_set),
                                    "features":[feature_set],
                                    "train_score": train_round_score,
                                    "test_score": test_round_score})
    
    scores_df = pd.concat([scores_df, round_df])
    scores_df = scores_df.reset_index(drop=True)
    
    print("-"*100)


# In[ ]:


# Check Scores
scores_df


# In[ ]:


# Extract the best score
scores_df.iloc[scores_df["test_score"].idxmax()]


# **FEATURE SELECTION XGBOOST SCORES:**
#     
# * Train CV Score: -11.55
# * Test CV Score: -15.34

# In[ ]:


# Best Features
scores_df.iloc[scores_df["test_score"].idxmax()]["features"]


# In[ ]:


# Plot Scores
fig = px.line(data_frame=scores_df,
                x="total_features",
                y="test_score",
                markers=True,
                title="Average CV score per subset of features | XGBoost Regressor")

fig.add_scatter(x=scores_df["total_features"],
                y=scores_df["train_score"], 
                mode="lines+markers",
                name="Train CV Score"
               )

fig.show()


# In[ ]:


# Store feature selection dataset
# scores_df.to_csv("feature_selection_XGBoost_Regressor_importances_scores.csv", index=False)


# In[ ]:


# Best features for XGB
xgb_features_list = [
    'Distance',
    'Demand_enc',
    'Duration',
    'Fuel_Price',
    'Number_of_Stops',
    'Month_of_Travel_enc',
    'Day_of_Week_enc',
    'Weather_Conditions_enc',
    'Holiday_Season_enc'
]


# # 3️⃣ HYPERPARAMETER TUNING

# **Randomized hyperparameter tuning using the best subset of features for each model.**
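# 
# With `n_iter=100` candidate configurations and 3-fold CV, each randomized search below fits 100 × 3 = 300 models per algorithm (plus one final refit on the full data).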

# ## HistGradientBoostingRegressor

# In[ ]:


# # Load Feature Selection object
# scores_df = pd.read_csv("feature_selection_HistGradientBoostingRegressor_permutation_importances_scores.csv")

# # Select features set with the best score
# best_score_loc = scores_df["test_score"].idxmax()
# hist_features_list = eval(scores_df.loc[best_score_loc, "features"])


# In[ ]:


print("Total features:", len(hist_features_list))
print("Features:", hist_features_list)


# In[ ]:


# HistGradientBoosting Regressor
model = HistGradientBoostingRegressor(random_state=42)

# Define hyperparameter ranges
param_distributions = {
    'quantile': np.linspace(0.01, 0.99, 99),  # only used when loss='quantile'; ignored for the default squared-error loss
    'learning_rate': np.linspace(0.01, 0.5, 50),
    'max_iter': np.arange(100, 1001, 100),
    'max_leaf_nodes': np.arange(10, 101, 10),
    'max_depth': np.append(np.arange(3, 21), None),
    'min_samples_leaf': np.arange(1, 11),
    'l2_regularization': np.logspace(-6, 2, 100),
    'max_bins': np.arange(2, 256),
    'interaction_cst': ['pairwise', 'no_interactions', [{0, 1}, {2, 3, 4}]],
}

search = RandomizedSearchCV(estimator=model,
                            param_distributions=param_distributions,
                            n_iter=100,
                            scoring='neg_root_mean_squared_error',
                            cv=cv_strategy,
                            return_train_score=True,
                            verbose=4,
                            n_jobs=-1,
                            random_state=42)

# Run the hyperparameter search
search.fit(X=full_df[hist_features_list],
           y=full_df[TARGET])


# In[ ]:


# Access cross-validation results
cv_results = search.cv_results_

# Extract relevant metrics
params = cv_results['params']  # Hyperparameter configurations
mean_test_scores = cv_results['mean_test_score']  
std_test_scores = cv_results['std_test_score']  
mean_train_score = cv_results['mean_train_score']  
std_train_score = cv_results['std_train_score'] 


# Create a dataframe to store the metrics
cv_metrics_df = pd.DataFrame({'params': params,
                              'mean_test_score': mean_test_scores,
                              'std_test_score': std_test_scores,
                              'mean_train_score': mean_train_score,
                              'std_train_score': std_train_score
                             })

# Sort the dataframe by descending mean_test_score
cv_metrics_df = cv_metrics_df.sort_values(by='mean_test_score', ascending=False).reset_index(drop=True)

# Print the dataframe
cv_metrics_df


# In[ ]:


# Check best scores
cv_metrics_df.iloc[0].values


# In[ ]:


# Store cv_metrics dataset
# cv_metrics_df.to_csv("cv_metrics_df_HistGradientBoostingRegressor.csv", index=False)


# **TUNED HIST SCORES (RMSE):**
# 
# * Train CV RMSE: 11.29
# * Test CV RMSE: 12.78
# 
# (Note: You can validate hyperparameter tuning results in the `ENSEMBLE` section, testing each pipeline separately.)

# In[ ]:


# BEST HYPERPARAMETERS FOR HIST
hist_best_params = {
    'quantile': 0.77,
    'min_samples_leaf': 3,
    'max_leaf_nodes': 40,
    'max_iter': 800,
    'max_depth': 18,
    'max_bins': 62,
    'learning_rate': 0.13,
    'l2_regularization': 8.697490026177834e-05,
    'interaction_cst': 'pairwise'
}


# ## LightGBM

# In[ ]:


# # Load Feature Selection object
# scores_df = pd.read_csv("feature_selection_LightGBM_Regressor_importances_scores.csv")

# # Select features set with the best score
# best_score_loc = scores_df["test_score"].idxmax()
# lgb_features_list = eval(scores_df.loc[best_score_loc, "features"])


# In[ ]:


print("Total features:", len(lgb_features_list))
print("Features:", lgb_features_list)


# In[ ]:


# LGB Regressor
model = lgb.LGBMRegressor(random_state=42)

# Define hyperparameter ranges
param_distributions = {
    'boosting_type': ['gbdt', 'dart', 'goss'],  # 'goss' as a boosting_type is deprecated in LightGBM >= 4.0
    'num_leaves': list(range(8, 128, 4)),
    'max_depth': list(range(1, 32, 2)),
    'learning_rate': np.logspace(-5, 0, 20),
    'n_estimators': list(range(10, 1000, 10)),
    'min_child_samples': list(range(1, 200, 50)),
    'reg_alpha': np.logspace(-5, 0, 20),
    'reg_lambda': np.logspace(-5, 0, 20),
    'subsample': np.linspace(0.1, 1.0, 10),
    'colsample_bytree': np.linspace(0.1, 1.0, 10),
}

search = RandomizedSearchCV(estimator=model,
                            param_distributions=param_distributions,
                            n_iter=100,
                            scoring='neg_root_mean_squared_error',
                            cv=cv_strategy,
                            return_train_score=True,
                            verbose=2,
                            n_jobs=-1,
                            random_state=42)

# Run the hyperparameter search
search.fit(X=full_df[lgb_features_list],
           y=full_df[TARGET])


# In[ ]:


# Access cross-validation results
cv_results = search.cv_results_

# Extract relevant metrics
params = cv_results['params']  # Hyperparameter configurations
mean_test_scores = cv_results['mean_test_score'] 
std_test_scores = cv_results['std_test_score']
mean_train_scores = cv_results['mean_train_score'] 
std_train_scores = cv_results['std_train_score']


# Create a dataframe to store the metrics
cv_metrics_df = pd.DataFrame({'params': params,
                              'mean_test_score': mean_test_scores,
                              'std_test_score': std_test_scores,
                              'mean_train_score': mean_train_scores,
                              'std_train_score': std_train_scores
                             })

# Sort the dataframe by descending mean_test_score
cv_metrics_df = cv_metrics_df.sort_values(by='mean_test_score', ascending=False).reset_index(drop=True)

# Print the dataframe
cv_metrics_df


# In[ ]:


# Check best hyperparameters
cv_metrics_df.iloc[0].values


# In[ ]:


# Store cv_metrics dataset
# cv_metrics_df.to_csv("cv_metrics_df_LightGBM_Regressor.csv", index=False)


# **TUNED LGB SCORES (RMSE):**
# 
# * Train CV RMSE: 11.72
# * Test CV RMSE: 12.70
# 
# (Note: You can validate hyperparameter tuning results in the `ENSEMBLE` section, testing each pipeline separately.)

# In[ ]:


# BEST HYPERPARAMETERS FOR LGB
lgb_best_params = {
    'subsample': 0.5,
    'reg_lambda': 0.01438449888287663,
    'reg_alpha': 0.007847599703514606,
    'num_leaves': 28,
    'n_estimators': 690,
    'min_child_samples': 51, 
    'max_depth': 11,
    'learning_rate': 0.026366508987303583,
    'colsample_bytree': 0.5,
    'boosting_type': 'goss'
}


# ## XGBoost

# In[ ]:


# # Load Feature Selection object
# scores_df = pd.read_csv("feature_selection_XGBoost_Regressor_importances_scores.csv")

# # Select features set with the best score
# best_score_loc = scores_df["test_score"].idxmax()
# xgb_features_list = eval(scores_df.loc[best_score_loc, "features"])


# In[ ]:


print("Total features:", len(xgb_features_list))
print("Features:", xgb_features_list)


# In[ ]:


# XGB Regressor
model = xgb.XGBRegressor(random_state=42)

# Define hyperparameter ranges
param_distributions = {
    'n_estimators': np.arange(100, 1001, 100),  # Range from 100 to 1000 with step 100
    'learning_rate': np.linspace(0.01, 0.5, 50),  # Range from 0.01 to 0.5 with 50 values
    'max_depth': np.arange(3, 8),  # Range from 3 to 7
    'min_child_weight': np.arange(1, 6),  # Range from 1 to 5
    'subsample': np.linspace(0.6, 1.0, 5),  # Range from 0.6 to 1.0 with 5 values
    'colsample_bytree': np.linspace(0.6, 1.0, 5),  # Range from 0.6 to 1.0 with 5 values
    'gamma': np.linspace(0, 0.4, 5),  # Range from 0 to 0.4 with 5 values
    'reg_alpha': np.logspace(-2, 1, 4),  # Range from 0.01 to 10 in log scale with 4 values
    'reg_lambda': np.logspace(-2, 1, 4)  # Range from 0.01 to 10 in log scale with 4 values
}

search = RandomizedSearchCV(estimator=model,
                            param_distributions=param_distributions,
                            n_iter=100,
                            scoring='neg_root_mean_squared_error',
                            cv=cv_strategy,
                            return_train_score=True,
                            verbose=4,
                            n_jobs=-1,
                            random_state=42)

# Run the hyperparameter search
search.fit(X=full_df[xgb_features_list],
           y=full_df[TARGET])


# In[ ]:


# Access cross-validation results
cv_results = search.cv_results_

# Extract relevant metrics
params = cv_results['params']  # Hyperparameter configurations
mean_test_scores = cv_results['mean_test_score']  
std_test_scores = cv_results['std_test_score']
mean_train_scores = cv_results['mean_train_score']  
std_train_scores = cv_results['std_train_score']


# Create a dataframe to store the metrics
cv_metrics_df = pd.DataFrame({'params': params,
                              'mean_test_score': mean_test_scores,
                              'std_test_score': std_test_scores,
                              'mean_train_score': mean_train_scores,
                              'std_train_score': std_train_scores
                             })

# Sort the dataframe by descending mean_test_score
cv_metrics_df = cv_metrics_df.sort_values(by='mean_test_score', ascending=False).reset_index(drop=True)

# Print the dataframe
cv_metrics_df


# **TUNED XGB SCORES (RMSE):**
# 
# * Train CV RMSE: 11.71
# * Test CV RMSE: 12.69
# 
# (Note: You can validate hyperparameter tuning results in the `ENSEMBLE` section, testing each pipeline separately.)

# In[ ]:


# Check best params
cv_metrics_df.iloc[0].values


# In[ ]:


# Store cv_metrics dataset
# cv_metrics_df.to_csv("cv_metrics_df_XGBoost_Regressor.csv", index=False)


# In[ ]:


# BEST HYPERPARAMETERS FOR XGB
xgb_best_params = {
    'subsample': 0.6,
    'reg_lambda': 10.0,
    'reg_alpha': 0.1,
    'n_estimators': 900,
    'min_child_weight': 4,
    'max_depth': 4,
    'learning_rate': 0.02, 
    'gamma': 0.30000000000000004, 
    'colsample_bytree': 1.0
}


# # 4️⃣ ENSEMBLE

# In[ ]:


print("HIST")
print("BEST FEATURES:", hist_features_list)
print("BEST PARAMS:", hist_best_params)

print()
print("LGB")
print("BEST FEATURES:", lgb_features_list)
print("BEST PARAMS:", lgb_best_params)

print()
print("XGB")
print("BEST FEATURES:", xgb_features_list)
print("BEST PARAMS:", xgb_best_params)


# In[ ]:


## ENSEMBLE PIPELINE
hist_pipeline = Pipeline([
    ('selector', ColumnTransformer([("selector", "passthrough", hist_features_list)], remainder="drop")),
    ('hist_reg',  HistGradientBoostingRegressor(**hist_best_params, random_state=42))
])

lgb_pipeline = Pipeline([
    ('selector', ColumnTransformer([("selector", "passthrough", lgb_features_list)], remainder="drop")),
    ('lgb_reg',  lgb.LGBMRegressor(**lgb_best_params, random_state=42))
])


xgb_pipeline = Pipeline([
    ('selector', ColumnTransformer([("selector", "passthrough", xgb_features_list)], remainder="drop")),
    ('xgb_reg',  xgb.XGBRegressor(**xgb_best_params, random_state=42))
])


ensemble_pipeline = VotingRegressor(
    estimators=[
        ('hist_pipeline', hist_pipeline),
        ('lgb_pipeline', lgb_pipeline),
        ('xgb_pipeline', xgb_pipeline)
    ],
    weights=[1, 1, 1]
)

ensemble_pipeline
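

# Each base pipeline uses a `ColumnTransformer` with `"passthrough"` purely as a column selector, so every model sees only its own best feature subset while the ensemble as a whole is fed the full `FEATURES` frame.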


# In[ ]:


# Run Cross-Validation
cv_scores = cross_validate(
    estimator = ensemble_pipeline, # ensemble_pipeline, hist_pipeline, lgb_pipeline, xgb_pipeline
    X = full_df[FEATURES], 
    y = full_df[TARGET],
    cv=cv_strategy, 
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
    verbose=3,
    n_jobs=-1
)


# In[ ]:


# Cross Validation Scores
cv_scores_df = pd.DataFrame(cv_scores)
cv_scores_df


# In[ ]:


# Average Score
train_cv_score = cv_scores_df["train_score"].mean().round(2)
test_cv_score = cv_scores_df["test_score"].mean().round(2)

print("Train CV Score:", train_cv_score)
print("Test CV Score:", test_cv_score)


# **TUNED ENSEMBLE MODEL (HIST, LGB, XGB):**
# 
# * Train CV RMSE: 11.11
# * Test CV RMSE: **12.25** <---- Best Model 🏆

# In[ ]:


# Fit the pipeline using the full dataset
ensemble_pipeline.fit(full_df[FEATURES], full_df[TARGET])
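

# Optional sanity check (sketch): with equal weights, the ensemble prediction should equal the plain average of the three base pipelines' predictions.

# In[ ]:


base_preds = np.column_stack(
    [est.predict(full_df[FEATURES]) for est in ensemble_pipeline.estimators_]
)
assert np.allclose(ensemble_pipeline.predict(full_df[FEATURES]), base_preds.mean(axis=1))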


# In[ ]:


# Save the pipeline
# joblib.dump(ensemble_pipeline, "../models/ensemble_pipeline_v2.joblib")
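

# To reload the saved pipeline later (sketch, assuming the path above and matching library versions):

# In[ ]:


# loaded_pipeline = joblib.load("../models/ensemble_pipeline_v2.joblib")
# loaded_preds = loaded_pipeline.predict(full_df[FEATURES])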


# # SUBMISSION FILE

# ## FEATURE ENGINEERING (TEST)

# In[ ]:


del full_df


# In[ ]:


# Load data
full_df = pd.read_csv("../data/input/test.csv")


# In[ ]:


# Check shape
full_df.shape


# In[ ]:


# Airline Encoding
full_df['Airline_enc'] = full_df['Airline'].map({'Airline A': 1, 'Airline B': 2, 'Airline C': 3})


# In[ ]:


# Extract info from "Arrival_City", "Departure_City"
prefixes = ["New", "Port", "Lake", "South", "East", "West", "North"]


def extract_prefixes(city):
    # Use pd.isna for missing values: identity checks against NaN are
    # unreliable, and np.NAN was removed in NumPy 2.0
    if pd.isna(city):
        return np.nan
    ls = [prefix for prefix in prefixes if prefix in city]
    return "-".join(ls)


# In[ ]:


# Arrival City
full_df["Arrival_City_prefix"] = full_df["Arrival_City"].apply(extract_prefixes)


# In[ ]:


# Arrival City Encoding
full_df["Arrival_City_prefix_enc"] = (full_df["Arrival_City_prefix"]
                                      .map({'':0,
                                            'New': 1, 
                                            'Port': 2, 
                                            'Lake': 3,
                                            'West':4,
                                            'East':5,
                                            'South':6,
                                            'North':7
                                           })
                                     )


# In[ ]:


# Departure City
full_df["Departure_City_prefix"] = full_df["Departure_City"].apply(extract_prefixes)


# In[ ]:


# Encoding
full_df["Departure_City_prefix_enc"] = (full_df["Departure_City_prefix"]
                                      .map({'':0,
                                            'New': 1, 
                                            'Port': 2, 
                                            'Lake': 3,
                                            'West':4,
                                            'East':5,
                                            'South':6,
                                            'North':7
                                           })
                                     )


# In[ ]:


# Arrival Time Hour
full_df["Arrival_Time_Hour"] = full_df["Arrival_Time"].apply(lambda x: x.split(":")[0]).astype(int)

# Arrival Time Number
full_df["Arrival_Time_num"] = full_df["Arrival_Time"].str.replace(":","").astype(int)

# Day or night flag (1 = arrival before 12:00)
full_df["Arrival_Day_Night"] = (full_df["Arrival_Time_num"] < 1_200).astype(int)


# In[ ]:


# Departure Time Hour
full_df["Departure_Time_Hour"] = full_df["Departure_Time"].apply(lambda x: x.split(":")[0]).astype(int)

# Departure Time Number
full_df["Departure_Time_num"] = full_df["Departure_Time"].str.replace(":","").astype(int)

# Day or night flag (1 = departure before 12:00)
full_df["Departure_Day_Night"] = (full_df["Departure_Time_num"] < 1_200).astype(int)


# In[ ]:


# Aircraft Type
full_df['Aircraft_Type_enc'] = (full_df['Aircraft_Type']
                                .map({'Boeing 787':1, 'Airbus A320':2, 'Boeing 737':3, 
                                      'Boeing 777':4, 'Airbus A380':5})
                               )


# In[ ]:


# Day of Week
full_df['Day_of_Week_enc'] = (full_df['Day_of_Week']
                              .map({'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                                    'Friday': 5, 'Saturday': 6, 'Sunday': 7})
                             )


# In[ ]:


# Weekend
full_df['Weekend'] = (full_df['Day_of_Week_enc'] >= 6).astype(int)


# In[ ]:


# Month of Travel
full_df['Month_of_Travel_enc'] = (full_df['Month_of_Travel']
                                  .map({'January': 1, 'February': 2, 'March': 3, 
                                        'April': 4, 'May':5, 'June':6,
                                        'July': 7, 'August': 8, 'September': 9,
                                        'October':10, 'November':11, 'December':12})
                                 )

# Month of Travel Holiday
full_df['Month_of_Travel_Holiday_enc'] = (full_df['Month_of_Travel']
                                          .map({'January': 2, 'February': 3, 'March': 1, 
                                                'April': 2, 'May':3, 'June':1,
                                                'July': 2, 'August': 3, 'September': 1,
                                                'October':2, 'November':3, 'December':1})
                                         )


# In[ ]:


# Holiday
full_df['Holiday_Season_enc'] = (full_df['Holiday_Season']
                                  .map({'Summer': 1, 'Spring': 2, 'Fall': 3, 
                                        'Winter': 4, 'None':5})
                                 )


# In[ ]:


# Demand
full_df['Demand_enc'] = (full_df['Demand']
                                  .map({'Low': 1, 'Medium': 2, 'High': 3})
                                 )


# In[ ]:


#  Weather Conditions
full_df['Weather_Conditions_enc'] = (full_df['Weather_Conditions']
                                  .map({'Clear': 1, 'Cloudy': 2, 'Rain': 3, 'Snow':4})
                                 )


# In[ ]:


# Promotion Type
full_df['Promotion_Type_enc'] = (full_df['Promotion_Type']
                                  .map({'None': 1, 'Discount': 2, 'Special Offer': 3})
                                 )


# ## PREDICT

# In[ ]:


# Check datasets shape
full_df.shape


# In[ ]:


# Predict
preds = ensemble_pipeline.predict(full_df[FEATURES])


# In[ ]:


# Create submission file
submission_df = pd.DataFrame(data = {
    "Flight_ID": full_df["Flight_ID"].values,
    "Flight_Price": preds
})


# In[ ]:


# Check submission file
submission_df


# In[ ]:


# Check shape
submission_df.shape


# In[ ]:


# Save submission file
# submission_df.to_csv("../data/submission/submission_v2.csv", index=False)

