Trying to build data preparation pipeline. But got error message suggests that the column 'short_model' is not found in the DataFrame that I'm trying to use within pipeline. But I created it in the ShortModelTransformer() class. Maybe I missed smth in that class? Killing 3 days already trying to figure that out
class CalculateOutliers(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X):
q25 = X['year'].quantile(0.25)
q75 = X['year'].quantile(0.75)
iqr = q75 - q25
boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
X['year'].loc[X['year'] < boundaries[0]] = round(boundaries[0])
X['year'].loc[X['year'] > boundaries[1]] = round(boundaries[1])
return X
class ShortModelTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def short_model(self, x):
if not pd.isna(x):
return x.lower().split(' ')[0]
else:
return x
def transform(self, X):
X['short_model'] = X['model'].apply(self.short_model)
return X
class AgeCategoryTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X):
X['age_category'] = X['year'].apply(lambda x: 'new' if x > 2013 else ('old' if x < 2006 else 'average'))
return X
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])
numerical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('encoder', StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
('outliers', CalculateOutliers(), ['year']),
('short_model', ShortModelTransformer(), ['model']),
('age_category', AgeCategoryTransformer(), ['year']),
('numerical', numerical_transformer, ['odometer']),
('categorical', categorical_transformer, ['fuel', 'title_status', 'transmission',
'state', 'short_model', 'age_category'])
])