How to get regression quantiles with an older version of xgboost (1.6.2)?

47 views Asked by At

In the new version 2.0 of XGBoost we have quantile regression.

I would like to implement quantile regression on the older xgboost 1.x release using a custom objective function for alpha_list = [0.05, 0.5, 0.95], where 0.05 is the lower bound, 0.5 is the median and 0.95 is the upper bound. Unfortunately, I can't seem to get a correct implementation of this quantile function. I assume quantile regression with xgboost 1.x is a common problem. How would you implement these quantiles correctly?

Below is my best working example, which uses a different model (scikit-learn's GradientBoostingRegressor). However, what I actually want is to pass a custom function as the objective to xgboost==1.6.3.

pip install xgboost==1.6.3

Example

import logging
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def uncertainty_model(
    X_train=None,  
    categorical_features=None,  # List of names of the categorical features
    numeric_features=None,  # List of names of the numeric features
    y_train=None,  # The target values
    alpha_param=0.5,  # The quantile to predict. 0.5 for median, <0.5 for lower quantiles, >0.5 for upper quantiles
):
    """Fit a preprocessing + quantile-regression pipeline for one quantile.

    Args:
        X_train: DataFrame holding at least the columns named in
            ``categorical_features`` and ``numeric_features``. It is copied
            before any dtype casting, so the caller's frame is not modified.
        categorical_features: Names of the categorical columns. ``None`` is
            treated as "no categorical columns".
        numeric_features: Names of the numeric columns. ``None`` is treated
            as "no numeric columns".
        y_train: Target values aligned with ``X_train``.
        alpha_param: Quantile passed to ``GradientBoostingRegressor`` as
            ``alpha`` (0.5 is the median).

    Returns:
        The fitted scikit-learn pipeline (column preprocessing followed by
        the quantile gradient-boosting model).
    """
    # Log the beginning of the model fitting process, including the alpha parameter
    logging.info("Fitting quantile model alpha=%s", alpha_param)
    start_time = time.time()

    # The signature defaults are None; normalise to lists so the concatenation
    # below cannot fail with "NoneType + NoneType".
    categorical_features = (
        [] if categorical_features is None else list(categorical_features)
    )
    numeric_features = [] if numeric_features is None else list(numeric_features)

    # Combine categorical and numeric features into a single list
    features = categorical_features + numeric_features

    # Work on a copy and cast only the column groups that actually exist.
    X_train = X_train.copy()
    if categorical_features:
        X_train[categorical_features] = X_train[categorical_features].astype("category")
    if numeric_features:
        X_train[numeric_features] = X_train[numeric_features].astype("float64")

    # Categorical pipeline: impute FIRST, then one-hot encode. Imputing after
    # encoding (the previous order) meant the "most_frequent" strategy never
    # saw the raw missing values it was supposed to fill.
    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(),
    )
    # Numeric pipeline: scale, then fill remaining gaps with the median.
    # NOTE(review): scaling precedes imputation as in the original; confirm
    # that StandardScaler tolerates NaN in the scikit-learn version in use.
    numeric_transformer = make_pipeline(
        StandardScaler(), SimpleImputer(strategy="median")
    )

    # Combine the transformers for numeric and categorical features using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    # GradientBoostingRegressor configured for quantile regression with the
    # requested alpha. This is the model the question wants to replace with
    # xgboost.XGBRegressor driven by a custom quantile objective.
    model = GradientBoostingRegressor(
        loss="quantile",  # Use quantile loss for quantile regression
        max_depth=5,
        alpha=alpha_param,  # The quantile to predict
    )

    pipeline = make_pipeline(preprocessor, model)
    pipeline.fit(X_train[features], y_train)

    end_time = time.time()
    logging.info("Time taken: %.6f seconds", end_time - start_time)

    # Return the fitted pipeline
    return pipeline


# Demo driver: fit one quantile model per alpha on synthetic data, report the
# test-set MAE of each, and plot the three prediction curves together.
# Plotting relies on `plt` (matplotlib.pyplot) being imported at the top of
# the file; without that import this section fails with a NameError.

# Generate synthetic data
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
y_series = pd.Series(y, name='target')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_series, test_size=0.2, random_state=42)

# Assuming all features are numeric for this example
numeric_features = X_df.columns.tolist()
categorical_features = []


# Quantiles to fit: lower bound, median, upper bound
alpha_list = [0.05, 0.5, 0.95]
predictions = {}  # Maps each alpha to its test-set predictions

# Iterate over the list of alpha values and fit a model for each
for alpha in alpha_list:
    print(f"Testing alpha={alpha}")
    pipeline = uncertainty_model(
        X_train=X_train,
        categorical_features=categorical_features,
        numeric_features=numeric_features,
        y_train=y_train,
        alpha_param=alpha
    )

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Calculate and print the Mean Absolute Error (MAE) for evaluation.
    # MAE is most meaningful for the median (alpha=0.5) model; the outer
    # quantiles are expected to sit away from the observed targets.
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Alpha: {alpha}, MAE: {mae:.4f}\n")
    predictions[alpha] = y_pred

# Plotting the predictions: one line per quantile, indexed by test-row position
plt.figure(figsize=(10, 6))
x_axis = np.arange(len(X_test))
for alpha, pred in predictions.items():
    plt.plot(x_axis, pred, label=f'Alpha = {alpha}')

plt.title('Quantile Regression Predictions ')
plt.xlabel('Index')
plt.ylabel('Predicted Value')
plt.legend()
plt.show()
0

There are 0 answers