Testing model gives error: "y contains previously unseen labels"

75 views Asked by At
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
import joblib

# Load the data
data = pd.read_csv("data.csv")

# Data preprocessing
# Convert the "eligible_date" column to datetime
data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')

# Feature engineering
data['year'] = data['eligible_date'].dt.year
data['month'] = data['eligible_date'].dt.month
data['day'] = data['eligible_date'].dt.day
data['weekday'] = data['eligible_date'].dt.weekday

# Label encoding for the "country" and "package" columns
label_encoder = LabelEncoder()
data['country_encoded'] = label_encoder.fit_transform(data['country'])
data['package_encoded'] = label_encoder.fit_transform(data['package'])
data['model_encoded'] = label_encoder.fit_transform(data['model'])

joblib.dump(label_encoder, 'label_encoder.pkl')

# Drop unnecessary columns
data.drop(['eligible_date', 'country', 'package','model'], axis=1, inplace=True)

# Split the data into features and target
X = data.drop('payout', axis=1)
y = data['payout']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define an XGBoost model with hyperparameters
model = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

joblib.dump(model, 'xgboost_model.pkl')

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# You can also use cross-validation to assess model performance
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')

print("Cross-Validation Mean Squared Error:", np.mean(cv_scores))

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 500, 1000, 1500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=kf, random_state=42)
random_search.fit(X, y)

print("Best Hyperparameters:", random_search.best_params_)

#### testing phase

loaded_label_encoder = joblib.load('label_encoder.pkl')
loaded_model = joblib.load('xgboost_model.pkl')

# Create a function to predict total payout for a given month
def predict_total_payout_for_month(target_time, model, encoder):
    # Create a sample DataFrame with one row for the specified month
    sampledata = pd.DataFrame({
        'eligible_date': pd.to_datetime([target_time], format='%Y-%m-%d'),
        #'eligible_date': [datetime.strptime(target_month, '%Y-%m')],
        'country': ['Malaysia'],  # Replace with a valid country name
        'package': ['Express 100K'],
        'model': ['Express'],# Replace with a valid package name
        'payout': [0]  # This value will be overwritten
    })
#data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')
    # Data preprocessing
    # sampledata['month'] = sampledata['eligible_date'].dt.to_period('M')
    sampledata['year'] = sampledata['eligible_date'].dt.year
    sampledata['month'] = sampledata['eligible_date'].dt.month
    sampledata['day'] = sampledata['eligible_date'].dt.day
    sampledata['weekday'] = sampledata['eligible_date'].dt.weekday
    # Assuming "country" and "package" columns need encoding
    sampledata['country_encoded'] = encoder.fit_transform(sampledata['country'])
    sampledata['package_encoded'] = encoder.fit_transform(sampledata['package'])
    sampledata['model_encoded'] = encoder.fit_transform(sampledata['model'])
    print(sampledata)

    # Prepare the input features (X) for prediction
    X_new = sampledata[['year', 'month', 'day', 'weekday', 'country_encoded', 'package_encoded', 'model_encoded']]  # Make sure the columns match those used during training
    print(X_new)

    # Use the trained model to make predictions
    predicted_payout = model.predict(X_new)

    return predicted_payout  # Return the predicted payout for the specified month

# Specify the target month for prediction
target = '2023-11-15'  # Replace with the month you want to predict

# Use the function to make predictions
predicted_total_payout = predict_total_payout_for_month(target, loaded_model, loaded_label_encoder)

print(f"Predicted Total Payout for {target}: {predicted_total_payout}")

Running the test part the encoded values do not match. For example malaysia should be encoded as 79 but whatever country I put in the test, it encodes as 0. I am loading the labeldata pickel to transform the test data.

How can I adjust it so that the labeling from training phase is applied to the testing phase?

1

There are 1 answers

0
Marco Parola On

The problem is that you are fitting a new LabelEncoder to the test data. To apply the same encoding as in the training phase to the testing data, you should use the transform method of the LabelEncoder you used during training.

def predict_total_payout_for_month(target_time, model, encoder):
    # some code ...
    sampledata['country_encoded'] = encoder.transform(sampledata['country'])
    sampledata['package_encoded'] = encoder.transform(sampledata['package'])
    sampledata['model_encoded'] = encoder.transform(sampledata['model'])
    # some code