sklearn "ValueError: A given column is not a column of the dataframe" while trying to pass classes in the data prepocessor pipeline

76 views Asked by At

Trying to build data preparation pipeline. But got error message suggests that the column 'short_model' is not found in the DataFrame that I'm trying to use within pipeline. But I created it in the ShortModelTransformer() class. Maybe I missed smth in that class? Killing 3 days already trying to figure that out

class CalculateOutliers(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        q25 = X['year'].quantile(0.25)
        q75 = X['year'].quantile(0.75)
        iqr = q75 - q25
        boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
        X['year'].loc[X['year'] < boundaries[0]] = round(boundaries[0])
        X['year'].loc[X['year'] > boundaries[1]] = round(boundaries[1])
        return X

class ShortModelTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def short_model(self, x):
        if not pd.isna(x):
            return x.lower().split(' ')[0]
        else:
            return x

    def transform(self, X):
        X['short_model'] = X['model'].apply(self.short_model)
        return X

class AgeCategoryTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['age_category'] = X['year'].apply(lambda x: 'new' if x > 2013 else ('old' if x < 2006 else 'average'))
        return X

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('encoder', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('outliers', CalculateOutliers(), ['year']),
    ('short_model', ShortModelTransformer(), ['model']),
    ('age_category', AgeCategoryTransformer(), ['year']),
    ('numerical', numerical_transformer, ['odometer']),
    ('categorical', categorical_transformer, ['fuel', 'title_status', 'transmission',
                                              'state', 'short_model', 'age_category'])
])
0

There are 0 answers