I am fitting an imblearn pipeline (RandomUnderSampler, SelectKBest feature selection, and MLPClassifier) on my data in Databricks. Is there a way to estimate up front how long the fit will take to complete? (A rough back-of-the-envelope sketch of the kind of estimate I mean is at the end of the post.)
Data specs: 10M rows and 5K unique features, trimmed down with SelectKBest according to the parameter grid below. 108 fits in total as per GridSearchCV.
# imports implied by the code below (Pipeline is imblearn's, so the sampler step works inside it)
import numpy as np
import mlflow
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
param_grid = [
    {
        'fs__k': [500, 1000, 1500],
        'mlp__hidden_layer_sizes': [(100, 50, 25, 10, 5), (200, 50, 10, 5, 2)],
        'mlp__activation': ['relu'],
        'mlp__solver': ['adam'],
        'mlp__alpha': np.logspace(-1, 3, 6)
    }
]
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
print("Creation timestamp: {}".format(experiment.creation_time))
with mlflow.start_run(run_name=RUN_NAME):
    # define the pipeline steps
    random_state = 45
    rus = RandomUnderSampler(sampling_strategy='majority', replacement=False, random_state=random_state)
    fs = SelectKBest(score_func=mutual_info_classif)
    mlp = MLPClassifier(early_stopping=True, verbose=True)
    basepipe = Pipeline([('rus', rus), ('fs', fs), ('mlp', mlp)], verbose=True)
    # standalone resample + feature selection (these outputs are not reused by the grid search below)
    sampled_data, sampled_y_train = basepipe.named_steps['rus'].fit_resample(X=x_train_transformed, y=y_train)
    sampled_data = basepipe.named_steps['fs'].fit_transform(X=sampled_data, y=sampled_y_train)
    # transform the test data with the fitted selector
    sampled_data_test = basepipe.named_steps['fs'].transform(X=x_test_transformed)
    # fit the grid search over the full pipeline
    mlp_grid = GridSearchCV(basepipe, param_grid=param_grid, scoring=scorer, n_jobs=-1, cv=3, verbose=5)
    print("Fitting the Grid and Estimator")
    mlp_grid.fit(X=x_train_transformed, y=y_train)
    # predict
    print("Prediction on test set with best fitted estimator")
    y_pred_mlp = mlp_grid.predict(x_test_transformed)
    # score
    class_report_mlp = metrics.classification_report(y_true=y_test, y_pred=y_pred_mlp)
    print("Classification Report MLP:\n", class_report_mlp)
    best_score_mlp = metrics.precision_score(y_test, y_pred_mlp, labels=['0'], pos_label='0')
    print("Best Score MLP :\n", best_score_mlp, '\n', "*" * 80)
    # mlflow logging
    mlflow.log_param('training data shape', x_train_transformed.shape)
    mlflow.log_param('test data shape', x_test_transformed.shape)
    mlflow.log_param('cat_features', CAT_COL)
    mlflow.log_param('num_features', NUM_COL)
    mlflow.log_param('multi_cat_features', MULTI_LABEL_COL)
    mlflow.log_param('best_features_no', mlp_grid.best_params_['fs__k'])
    mlflow.log_param('best_layer', mlp_grid.best_params_['mlp__hidden_layer_sizes'])
    mlflow.log_param('best_activation', mlp_grid.best_params_['mlp__activation'])
    mlflow.log_param('best_solver', mlp_grid.best_params_['mlp__solver'])
    mlflow.log_param('best_alpha', mlp_grid.best_params_['mlp__alpha'])
    mlflow.log_metric('best_precision_minority_class', best_score_mlp)
    mlflow.sklearn.log_model(mlp_grid.best_estimator_, "best_model")
    mlflow.end_run()  # redundant inside the with-block (the context manager already ends the run), but harmless
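For reference, the 108 fits come from 3 values of fs__k × 2 hidden-layer configurations × 1 activation × 1 solver × 6 alphas = 36 candidates, times 3 CV folds. The kind of up-front estimate I have in mind is something like the rough sketch below. This is not part of my actual job: the single parameter combination is just one candidate from the grid, and the straight multiplication ignores n_jobs parallelism and the fact that fit time varies with fs__k, the layer sizes, and early stopping.

import time
from sklearn.base import clone

# Rough, illustrative estimate only: time one candidate fit and scale up.
probe = clone(basepipe).set_params(fs__k=1000,
                                   mlp__hidden_layer_sizes=(100, 50, 25, 10, 5),
                                   mlp__alpha=0.1)
start = time.time()
probe.fit(x_train_transformed, y_train)   # one full fit of the pipeline
single_fit_seconds = time.time() - start

n_candidates = 3 * 2 * 1 * 1 * 6           # 36 parameter combinations
n_fits = n_candidates * 3                  # 3-fold CV -> 108 fits
print("Naive total estimate:", single_fit_seconds * n_fits, "seconds (ignores parallel workers)")

Is there anything more reliable than extrapolating like this, e.g. something in GridSearchCV or MLflow on Databricks that can predict the total fit time?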