Catboost hyperparameter tuning with mlflow

125 views Asked by At

Hi I'm triying to hyperparameter tune a catboost binary classification model and to store the logs metrics and experiments with mlflow but I keep getting the following error. This is the code

search_space = {
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'depth': scope.int(hp.quniform('depth', 4, 15, 1)),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 1, 10),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'random_seed': 123,
}



def train_model(params):
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, nested=True): 

        model = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        depth=int(params['depth']),
        iterations=params['iterations'],
        subsample=params['subsample'],
        loss_function='Logloss',
        eval_metric='AUC',
        early_stopping_rounds=100,
        random_seed=42,
        verbose=False)

        model.fit(
            Xtrain, ytrain,
            eval_set=(X_validation, y_validation),
            logging_level='Verbose' )

        y_predcatboost = model.predict(X_testT)
        y_pred_probacatboost = model.predict_proba(X_testT)[:, 1]
        accuracycatboost = accuracy_score(y_test, y_predcatboost)
        precisioncatboost = precision_score(y_test, y_predcatboost)
        recallcatboost  = recall_score(y_test, y_predcatboost)
        f1catboost =  f1_score(y_test, y_predcatboost)
        roc_auccatboost  = roc_auc_score(y_test, y_pred_probacatboost)
        logloss_auccatboost  = log_loss(y_test, y_pred_probacatboost)

        validation_metrics = {
         
            'Precision': precisioncatboost, 
            'Recall': recallcatboost, 
            'F1': f1catboost, 
            'AUCROC': roc_auccatboost,
            'logloss':logloss_auccatboost

        }
        validation_metrics_values = list(validation_metrics.values())
        metric_names = [ 'Precision', 'Recall', 'F1', 'AUCROC','logloss']
        
        for name, metric in list(zip(metric_names, validation_metrics_values)):
            mlflow.log_metric(f'validation_{name}', metric)

        return {'status': STATUS_OK, 'AUCROC': -1*validation_metrics['AUCROC']}
spark_trials = SparkTrials(parallelism=10)

with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name='catboost_models_scaled_logloss'):
    catboost_best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        trials=spark_trials,
        max_evals=50
    )

This is the error

trial task 3 failed, exception is Caused by Traceback (most recent call last):
File "/databricks/spark/python/pyspark/serializers.py", line 189, in \_read_with_length
return self.loads(obj)
File "/databricks/spark/python/pyspark/serializers.py", line 541, in loads
return cloudpickle.loads(obj, encoding=encoding)
TypeError: InvalidLoss.__init__() missing 1 required positional argument: result```

pleas help me

I tried to modify the code and get my dataframes with Pool but it does not work

0

There are 0 answers