Cannot log LightGBM parameters using log_params in MLflow/Hyperopt

168 views Asked by At

I'm using hyperopt to optimize the hyperparameters of a LightGBM model. The code I use is shown below. I'm trying to log the hyperparameters with log_params() inside the objective function.

import hyperopt
import lightgbm as lgbm
import mlflow
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
from hyperopt.pyll.base import scope
from sklearn.metrics import f1_score


# Hyperparameter search space for the LightGBM classifier. Each entry maps a
# LightGBMClassifier keyword argument to a hyperopt sampling expression.
lgbm_space = dict(
    # Categorical choice among three boosting modes.
    boosting_type=hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
    # One of 400, 450, ..., 950.
    n_estimators=hp.choice(
        'n_estimators', np.arange(400, 1000, 50, dtype=int)
    ),
    # Uniform draw over [0.02, 0.5], quantised with step 0.02.
    learning_rate=hp.quniform('learning_rate', 0.02, 0.5, 0.02),
    # Quantised draw over [2, 16] with step 1, cast to int.
    max_depth=scope.int(hp.quniform('max_depth', 2, 16, 1)),
    # One of 10, 15, ..., 75.
    num_leaves=hp.choice("num_leaves", np.arange(10, 80, 5, dtype=int)),
    # Continuous uniform draws over [0.7, 1.0].
    colsample_bytree=hp.uniform('colsample_bytree', 0.7, 1.0),
    subsample=hp.uniform('subsample', 0.7, 1.0),
    # One of 10, 15, ..., 45.
    min_child_samples=hp.choice(
        'min_child_samples', np.arange(10, 50, 5, dtype=int)
    ),
)

# Settings for the optimisation run.
search_space = lgbm_space
run_name = "run_optimization"
max_eval = 100

# Objective function evaluated by hyperopt once per sampled configuration.
def objective(search_space):
    """Fit a LightGBM classifier with the sampled parameters and score it.

    Args:
        search_space: Mapping of LightGBMClassifier keyword arguments to the
            concrete values hyperopt sampled for this trial.

    Returns:
        A hyperopt result dict whose ``loss`` is ``1 - F1`` on the validation
        split, together with the fitted model and the parameters used.
    """
    classifier = lgbm.LGBMClassifier(
        **search_space,
        class_weight='balanced',
        n_jobs=-1,
        random_state=123,
    )
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    # Threshold the probability in column 1 at 0.5 to obtain 0/1 labels.
    positive_proba = classifier.predict_proba(X_val)[:, 1]
    predicted_labels = (positive_proba > 0.5).astype(int)
    f1 = f1_score(y_val, predicted_labels)

    # NOTE: both calls log into whichever MLflow run is currently active, so
    # every trial writes its parameters into the same run. This is the call
    # that triggers the "was already logged" error quoted below.
    mlflow.log_metric('f1 score', f1)
    mlflow.log_params(search_space)

    return {
        'loss': 1 - f1,
        'status': STATUS_OK,
        'model': classifier,
        'params': search_space,
    }

# Trial history kept in memory; note this is a plain hyperopt ``Trials``
# object even though the variable name mentions Spark.
spark_trials = Trials()

# One MLflow run wraps the complete hyperparameter search.
with mlflow.start_run(run_name=run_name):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_eval,
        trials=spark_trials,
    )

I got an error message like the one below:

INVALID_PARAMETER_VALUE: Parameter with key colsample_bytree was already logged with a value of 0.9523828639856076. The attempted new value was 0.7640043300157543

I'm not sure what I did wrong.

1

There is 1 answer

0
Kinjal On BEST ANSWER

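I added `with mlflow.start_run(nested=True):` inside the objective function, so that each evaluation logs to its own nested child run. MLflow does not allow a parameter that has already been logged in a run to be overwritten with a different value, which is why logging every trial's parameters into the single parent run failed. There was also an issue raised for this here. Now the code creates a separate folder for each evaluation, containing its params and metric.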

import numpy as np
from sklearn.metrics import f1_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval, Trials, SparkTrials
from hyperopt.pyll.base import scope 
import mlflow

iris = load_iris()
X_train, X_val, y_train, y_val = train_test_split(iris.data, iris.target, stratify=iris.target)


# Hyperparameter search space for the LightGBM classifier. Each entry maps a
# LightGBMClassifier keyword argument to a hyperopt sampling expression.
lgbm_space = dict(
    # Categorical choice among three boosting modes.
    boosting_type=hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
    # One of 400, 450, ..., 950.
    n_estimators=hp.choice(
        'n_estimators', np.arange(400, 1000, 50, dtype=int)
    ),
    # Uniform draw over [0.02, 0.5], quantised with step 0.02.
    learning_rate=hp.quniform('learning_rate', 0.02, 0.5, 0.02),
    # Quantised draw over [2, 16] with step 1, cast to int.
    max_depth=scope.int(hp.quniform('max_depth', 2, 16, 1)),
    # One of 10, 15, ..., 75.
    num_leaves=hp.choice("num_leaves", np.arange(10, 80, 5, dtype=int)),
    # Continuous uniform draws over [0.7, 1.0].
    colsample_bytree=hp.uniform('colsample_bytree', 0.7, 1.0),
    subsample=hp.uniform('subsample', 0.7, 1.0),
    # One of 10, 15, ..., 45.
    min_child_samples=hp.choice(
        'min_child_samples', np.arange(10, 50, 5, dtype=int)
    ),
)

# Settings for the optimisation run (only two evaluations, for a quick demo).
search_space = lgbm_space
run_name = "run_optimization"
max_eval = 2

# Objective function minimised by hyperopt; called once per trial.
def objective(search_space):
    """Train and score one LightGBM configuration inside its own MLflow run.

    Args:
        search_space: Mapping of LightGBMClassifier keyword arguments to the
            concrete values hyperopt sampled for this trial. Inside this
            function the name shadows the module-level search-space
            definition.

    Returns:
        A hyperopt result dict with ``loss`` set to ``1 - weighted F1`` on the
        validation split, ``status`` set to ``STATUS_OK`` and the fitted
        ``model``.
    """
    model = lgbm.LGBMClassifier(
        **search_space, class_weight='balanced', n_jobs=-1, random_state=123
    )
    callbacks = [
        # ``verbose`` is a boolean flag: pass False to silence the
        # early-stopping messages (any non-zero number is truthy and would
        # switch them on instead).
        lgbm.early_stopping(2, verbose=False),
        lgbm.log_evaluation(period=0),
    ]

    # A parameter cannot be re-logged with a different value within one MLflow
    # run, so every trial records its params and metric in its own nested
    # child run.
    with mlflow.start_run(nested=True):
        # Log the parameters first so they are recorded even if training fails.
        mlflow.log_params(search_space)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            callbacks=callbacks,
        )
        # Score on predicted class labels. Thresholding a single
        # ``predict_proba`` column can only ever yield the labels 0 and 1,
        # which is wrong as soon as the target has more than two classes.
        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='weighted')
        mlflow.log_metric('f1 score', f1)

    # hyperopt minimises the loss, so convert the score into 1 - F1.
    return {'loss': 1 - f1, 'status': STATUS_OK, 'model': model}

# Trial history kept in memory; note this is a plain hyperopt ``Trials``
# object even though the variable name mentions Spark.
spark_trials = Trials()

# Parent MLflow run for the whole search; each trial opens a nested child run
# from inside ``objective``.
with mlflow.start_run(run_name=run_name, nested=True):
    best_params = hyperopt.fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_eval,
        trials=spark_trials,
    )

# ``fmin`` reports ``hp.choice`` parameters as the index of the selected
# option, not the option itself; ``space_eval`` maps the result back onto the
# actual hyperparameter values before printing.
best_config = space_eval(search_space, best_params)
print("Best value found: ", best_config)