I am trying to build a linear model by using both Sklearn's linear regression and statsmodels.api.
The approach is to drop variables whose p-values and VIF values are higher than the norm (p-value : 0.05, VIF : <5)
bike_train columns are August, December, February, January, July, June, March, May, November, October, September, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday, Light Snow & Rain, Mist & Cloudy, Spring, Summer, Winter, temp, humidity, windspeed, bike_count
y_train = bike_train.pop('bike_count')
X_train = bike_train
# Running RFE with the output number of the variable equal to 15
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm, 15)
rfe = rfe.fit(X_train, y_train)
#List of selected varialbles
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Variables which have RFE support as true
col = X_train.columns[rfe.support_]
col
Output :
Index(['December', 'January', 'July', 'June', 'November', 'October',
'September', 'Sunday', 'Light Snow & Rain', 'Mist & Cloudy', 'Summer',
'Winter', 'temp', 'humidity', 'windspeed'],
dtype='object')
MODEL 1
# Creating X_train dataframe with RFE selected variables
X_train_rfe = X_train[col]
#Adding a constant
X_train_rfe = sm.add_constant(X_train_rfe)
# Running the linear model
lm = sm.OLS(y_train,X_train_rfe).fit()
#Dropping the constant
X_train_rfe = X_train_rfe.drop(['const'], axis=1)
#Summary of the linear model
print(lm.summary())
# Calculate the VIFs for the new model
vif = pd.DataFrame()
X = X_train_rfe
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 2
#Dropping January
X_train_new = X_train_rfe.drop(["January"], axis = 1)
#Rebuilding the model without "January"
X_train_lm = sm.add_constant(X_train_new)
lm_new = sm.OLS(y_train,X_train_lm).fit()
X_train_lm = X_train_lm.drop(['const'], axis=1)
print(lm_new.summary())
#checking VIF for new model without January
vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 3
#Dropping Humidity
X_train_new_1 = X_train_lm.drop(["humidity"], axis = 1)
#Rebuilding the model without "Humidity"
X_train_lm_1 = sm.add_constant(X_train_new_1)
lm_1 = sm.OLS(y_train,X_train_lm_1).fit()
X_train_lm_1 = X_train_lm_1.drop(['const'], axis=1)
print(lm_1.summary())
#checking VIF for new model without Humidity
vif = pd.DataFrame()
X = X_train_new_1
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 4
#Dropping Winter
X_train_new_2 = X_train_lm_1.drop(["Winter"], axis = 1)
#Rebuilding the model without "Winter"
X_train_lm_2 = sm.add_constant(X_train_new_2)
lm_2 = sm.OLS(y_train,X_train_lm_2).fit()
X_train_lm_2 = X_train_lm_2.drop(['const'], axis=1)
print(lm_2.summary())
#checking VIF for new model without Winter
vif = pd.DataFrame()
X = X_train_new_2
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 5
#Dropping June
X_train_new_3 = X_train_lm_2.drop(["June"], axis = 1)
#Rebuilding the model without "June"
X_train_lm_3 = sm.add_constant(X_train_new_3)
lm_3 = sm.OLS(y_train,X_train_lm_3).fit()
X_train_lm_3 = X_train_lm_3.drop(['const'], axis=1)
print(lm_3.summary())
#checking VIF for new model without June
vif = pd.DataFrame()
X = X_train_new_3
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
MODEL 6
#Dropping July
X_train_new_4 = X_train_lm_3.drop(["July"], axis = 1)
#Rebuilding the model without "July"
X_train_lm_4 = sm.add_constant(X_train_new_4)
lm_4 = sm.OLS(y_train, X_train_lm_4).fit()
X_train_lm_4 = X_train_lm_4.drop(['const'], axis=1)
print(lm_4.summary())
#checking VIF for new model without July
vif = pd.DataFrame()
X = X_train_new_4
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
y_train_pred = lm_4.predict(X_train_lm_4)
Error :
ValueError Traceback (most recent call last)
<ipython-input-38-f48f554d210b> in <module>
----> 1 y_train_pred = lm_4.predict(X_train_lm_4)
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\base\model.py in predict(self, exog, transform, *args, **kwargs)
1097 exog = np.atleast_2d(exog) # needed in count model shape[1]
1098
-> 1099 predict_results = self.model.predict(self.params, exog, *args,
1100 **kwargs)
1101
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in predict(self, params, exog)
378 exog = self.exog
379
--> 380 return np.dot(exog, params)
381
382 def get_distribution(self, params, scale, exog=None, dist_class=None):
<__array_function__ internals> in dot(*args, **kwargs)
ValueError: shapes (510,10) and (11,) not aligned: 10 (dim 1) != 11 (dim 0)
Scaled all the Numerical values before creating the model as follows:
scaler = MinMaxScaler()
num_vars=['temp','humidity','windspeed','bike_count']
bike_train[num_vars] = scaler.fit_transform(bike_train[num_vars])
bike_train.head()
Please tell me where have I done wrong, Thank you in advance!!