Hi, I'm trying to hyperparameter-tune a CatBoost binary classification model and to log the metrics and experiments with MLflow (Hyperopt with SparkTrials on Databricks), but I keep getting the error below. This is the code (the train/validation/test splits and `EXPERIMENT_ID` are defined earlier in my notebook):
```python
from hyperopt import hp, fmin, tpe, STATUS_OK, SparkTrials
from hyperopt.pyll import scope
from catboost import CatBoostClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, log_loss)
import mlflow

search_space = {
    'iterations': scope.int(hp.quniform('iterations', 100, 1000, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'depth': scope.int(hp.quniform('depth', 4, 15, 1)),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 1, 10),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'random_seed': 123,
}

def train_model(params):
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, nested=True):
        model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            depth=int(params['depth']),
            iterations=params['iterations'],
            subsample=params['subsample'],
            loss_function='Logloss',
            eval_metric='AUC',
            early_stopping_rounds=100,
            random_seed=42,
            verbose=False)

        model.fit(
            Xtrain, ytrain,
            eval_set=(X_validation, y_validation),
            logging_level='Verbose')

        y_predcatboost = model.predict(X_testT)
        y_pred_probacatboost = model.predict_proba(X_testT)[:, 1]

        accuracycatboost = accuracy_score(y_test, y_predcatboost)
        precisioncatboost = precision_score(y_test, y_predcatboost)
        recallcatboost = recall_score(y_test, y_predcatboost)
        f1catboost = f1_score(y_test, y_predcatboost)
        roc_auccatboost = roc_auc_score(y_test, y_pred_probacatboost)
        logloss_auccatboost = log_loss(y_test, y_pred_probacatboost)

        validation_metrics = {
            'Precision': precisioncatboost,
            'Recall': recallcatboost,
            'F1': f1catboost,
            'AUCROC': roc_auccatboost,
            'logloss': logloss_auccatboost
        }
        validation_metrics_values = list(validation_metrics.values())
        metric_names = ['Precision', 'Recall', 'F1', 'AUCROC', 'logloss']
        for name, metric in zip(metric_names, validation_metrics_values):
            mlflow.log_metric(f'validation_{name}', metric)

        return {'status': STATUS_OK, 'AUCROC': -1 * validation_metrics['AUCROC']}

spark_trials = SparkTrials(parallelism=10)

with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name='catboost_models_scaled_logloss'):
    catboost_best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        trials=spark_trials,
        max_evals=50
    )
```
This is the error:

```
trial task 3 failed, exception is Caused by Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/serializers.py", line 189, in _read_with_length
    return self.loads(obj)
  File "/databricks/spark/python/pyspark/serializers.py", line 541, in loads
    return cloudpickle.loads(obj, encoding=encoding)
TypeError: InvalidLoss.__init__() missing 1 required positional argument: result
```
Please help me.

I also tried modifying the code to pass my dataframes to CatBoost as `Pool` objects, but that did not work either.
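For reference, the `Pool` variant I tried looked roughly like this (a simplified sketch from memory; only the data wrapping changed, the rest of `train_model` stayed the same):

```python
from catboost import Pool

# Wrap the existing train/validation splits in CatBoost Pool objects
# (Xtrain, ytrain, X_validation, y_validation are the same variables as above)
train_pool = Pool(Xtrain, ytrain)
validation_pool = Pool(X_validation, y_validation)

# Fit on the Pool objects instead of the raw dataframes
model.fit(
    train_pool,
    eval_set=validation_pool,
    logging_level='Verbose')
```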