scikit-learn pipeline returning all nan values

630 views Asked by At

I am fairly new to Python and am trying to teach myself how to work with pipelines for feature preprocessing and model fitting. I tried to preprocess my data (drop features that may become constant under CV sampling, then scale the rest) and then fit a Cox PH model using elastic net for feature selection. I want to tune the parameters of the elastic net. However, I keep getting a score of nan for all models. When I fit one of these models 'by hand' it works fine, so I figure there must be something wrong in the way that I'm setting up the pipeline.

# Load the example data and split the columns into categorical and numeric groups.
# NOTE(review): `np` is used below, but `import numpy as np` is not shown in this snippet.
from sksurv.datasets import load_breast_cancer
X, y = load_breast_cancer()
cat_features = ["er", "grade"] # categorical features I want to OneHotEncode
# Every column of X that is not listed in cat_features is treated as numeric.
num_features = np.setdiff1d(X.columns, cat_features).tolist() # num features to scale
event_column = 'e.tdm'  # not referenced again in the code shown here

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sksurv.linear_model import CoxnetSurvivalAnalysis

# Create numeric preprocessing pipeline.
numeric_preprocess = Pipeline(steps=[
    ('vt0', VarianceThreshold()), # if I end up with a constant under cv, drop it.
    ('scale', StandardScaler()) # scale any feature that is not constant.
])

# Create categorical preprocessing pipeline.
# NOTE(review): VarianceThreshold runs before the one-hot encoder here, so it
# receives the raw categorical columns -- confirm it accepts their dtype.
# NOTE(review): newer scikit-learn releases are believed to rename `sparse` to
# `sparse_output` -- check against the installed version.
categorical_preprocess = Pipeline(steps=[
    ('vt0', VarianceThreshold()),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Combine these two pipelines into one.
# The step names 'number' and 'category' become part of the parameter paths
# used in the grid below (e.g. 'preprocess__number__scale').
full_preprocessor = ColumnTransformer(transformers=[
    ('number', numeric_preprocess, num_features),
    ('category', categorical_preprocess, cat_features)
])

# Instantiate model
cph = CoxnetSurvivalAnalysis()

# Combine the preprocessing and the model fit in a single pipeline
coxnet_pipe = Pipeline(steps = [
    ('preprocess', full_preprocessor),
    ('model', cph)
])
# figure out what the names are for the parameter grid
# (the keys are nested as '<step>__<substep>__<param>')
coxnet_pipe.get_params().keys()

# Hyperparameter grid: 3 scaler candidates x 9 alphas x 9 l1_ratios = 243 combinations.
# NOTE(review): the scaler candidates are the classes themselves, not instances
# (no parentheses). The AttributeError in the traceback
# ("'numpy.ndarray' object has no attribute 'fit'") is raised from this step.
# NOTE(review): each `model__alphas` candidate is a scalar here -- confirm
# whether CoxnetSurvivalAnalysis expects a sequence of alphas per candidate.
params = {
    'preprocess__number__scale': [StandardScaler, RobustScaler, MinMaxScaler],
    'model__alphas': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0], 
    'model__l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

from sklearn.model_selection import GridSearchCV, KFold
cv = KFold(n_splits=2, shuffle=True, random_state=0)  # 2 shuffled folds with a fixed seed
gcv = GridSearchCV(coxnet_pipe,  # search over `params` using the whole pipeline as the estimator
    param_grid = params, cv=cv, verbose = 3)
gcv.fit(X,y)  # 243 combinations x 2 folds = 486 fits, matching the count in the warning

Then I get the following:

/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning: 
486 fits failed out of a total of 486.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
486 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 675, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py", line 606, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 216, in __call__
    return self.function(*args, **kwargs)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py", line 434, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/base.py", line 852, in fit_transform
    return self.fit(X, **fit_params).transform(X)
AttributeError: 'numpy.ndarray' object has no attribute 'fit'

  warnings.warn(some_fits_failed_message, FitFailedWarning)
/opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_search.py:969: UserWarning: One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan]
  warnings.warn(
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Input In [197], in <cell line: 5>()
      2 cv = KFold(n_splits=2, shuffle=True, random_state=0)
      3 gcv = GridSearchCV(coxnet_pipe,
      4     param_grid = params, cv=cv, verbose = 3)
----> 5 gcv.fit(X,y)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/model_selection/_search.py:926, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    924 refit_start_time = time.time()
    925 if y is not None:
--> 926     self.best_estimator_.fit(X, y, **fit_params)
    927 else:
    928     self.best_estimator_.fit(X, **fit_params)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
    364 """Fit the model.
    365 
    366 Fit all the transformers one after the other and transform the
   (...)
    387     Pipeline with fitted steps.
    388 """
    389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
    391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
    392     if self._final_estimator != "passthrough":

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
    346     cloned_transformer = clone(transformer)
    347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
    349     cloned_transformer,
    350     X,
    351     y,
    352     None,
    353     message_clsname="Pipeline",
    354     message=self._log_message(step_idx),
    355     **fit_params_steps[name],
    356 )
    357 # Replace the transformer of the step with the fitted
    358 # transformer. This is necessary when loading the transformer
    359 # from the cache.
    360 self.steps[step_idx] = (name, fitted_transformer)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
    348 def __call__(self, *args, **kwargs):
--> 349     return self.func(*args, **kwargs)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891 with _print_elapsed_time(message_clsname, message):
    892     if hasattr(transformer, "fit_transform"):
--> 893         res = transformer.fit_transform(X, y, **fit_params)
    894     else:
    895         res = transformer.fit(X, y, **fit_params).transform(X)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py:675, in ColumnTransformer.fit_transform(self, X, y)
    672 self._validate_column_callables(X)
    673 self._validate_remainder(X)
--> 675 result = self._fit_transform(X, y, _fit_transform_one)
    677 if not result:
    678     self._update_fitted_transformers([])

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py:606, in ColumnTransformer._fit_transform(self, X, y, func, fitted, column_as_strings)
    600 transformers = list(
    601     self._iter(
    602         fitted=fitted, replace_strings=True, column_as_strings=column_as_strings
    603     )
    604 )
    605 try:
--> 606     return Parallel(n_jobs=self.n_jobs)(
    607         delayed(func)(
    608             transformer=clone(trans) if not fitted else trans,
    609             X=_safe_indexing(X, column, axis=1),
    610             y=y,
    611             weight=weight,
    612             message_clsname="ColumnTransformer",
    613             message=self._log_message(name, idx, len(transformers)),
    614         )
    615         for idx, (name, trans, column, weight) in enumerate(transformers, 1)
    616     )
    617 except ValueError as e:
    618     if "Expected 2D array, got 1D array instead" in str(e):

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:1043, in Parallel.__call__(self, iterable)
   1034 try:
   1035     # Only set self._iterating to True if at least a batch
   1036     # was dispatched. In particular this covers the edge
   (...)
   1040     # was very quick and its callback already dispatched all the
   1041     # remaining jobs.
   1042     self._iterating = False
-> 1043     if self.dispatch_one_batch(iterator):
   1044         self._iterating = self._original_iterator is not None
   1046     while self.dispatch_one_batch(iterator):

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
    859     return False
    860 else:
--> 861     self._dispatch(tasks)
    862     return True

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:779, in Parallel._dispatch(self, batch)
    777 with self._lock:
    778     job_idx = len(self._jobs)
--> 779     job = self._backend.apply_async(batch, callback=cb)
    780     # A job can complete so quickly than its callback is
    781     # called before we get here, causing self._jobs to
    782     # grow. To ensure correct results ordering, .insert is
    783     # used (rather than .append) in the following line
    784     self._jobs.insert(job_idx, job)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
    569 def __init__(self, batch):
    570     # Don't delay the application, to avoid keeping the input
    571     # arguments in memory
--> 572     self.results = batch()

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:262, in BatchedCalls.__call__(self)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/joblib/parallel.py:262, in <listcomp>(.0)
    258 def __call__(self):
    259     # Set the default nested backend to self._backend but do not set the
    260     # change the default number of processes to -1
    261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262         return [func(*args, **kwargs)
    263                 for func, args, kwargs in self.items]

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/utils/fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
    214 def __call__(self, *args, **kwargs):
    215     with config_context(**self.config):
--> 216         return self.function(*args, **kwargs)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    891 with _print_elapsed_time(message_clsname, message):
    892     if hasattr(transformer, "fit_transform"):
--> 893         res = transformer.fit_transform(X, y, **fit_params)
    894     else:
    895         res = transformer.fit(X, y, **fit_params).transform(X)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
    432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    433 if hasattr(last_step, "fit_transform"):
--> 434     return last_step.fit_transform(Xt, y, **fit_params_last_step)
    435 else:
    436     return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)

File /opt/anaconda3/envs/Python_current/lib/python3.9/site-packages/sklearn/base.py:852, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    848 # non-optimized default implementation; override when a better
    849 # method is possible for a given clustering algorithm
    850 if y is None:
    851     # fit method of arity 1 (unsupervised transformation)
--> 852     return self.fit(X, **fit_params).transform(X)
    853 else:
    854     # fit method of arity 2 (supervised transformation)
    855     return self.fit(X, y, **fit_params).transform(X)

AttributeError: 'numpy.ndarray' object has no attribute 'fit'
1

There is 1 answer

0
Ben Reiniger On BEST ANSWER

You just need some parentheses to instantiate your scalers in the hyperparameter space definition:

    'preprocess__number__scale': [StandardScaler, RobustScaler, MinMaxScaler],

to

    'preprocess__number__scale': [StandardScaler(), RobustScaler(), MinMaxScaler()],

The problem is that the methods take self as first argument, and without an instance to use, the positional argument X is handed over as the self argument, hence the final line in the traceback, self.fit(...), complains because self is actually a numpy array.