ValueError: shapes not aligned in multiple linear regression model

806 views Asked by At

I am trying to build a linear model by using both Sklearn's linear regression and statsmodels.api.

The approach is to drop variables whose p-values and VIF values are higher than the usual thresholds (p-value: 0.05, VIF: 5).

bike_train columns are August, December, February, January, July, June, March, May, November, October, September, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday, Light Snow & Rain, Mist & Cloudy, Spring, Summer, Winter, temp, humidity, windspeed, bike_count

# Split the target off the training frame; pop() removes 'bike_count'
# from bike_train in place, so the remaining columns are the predictors.
y_train = bike_train.pop('bike_count')
X_train = bike_train

# Running RFE with the output number of the variable equal to 15
lm = LinearRegression()
lm.fit(X_train, y_train)
# n_features_to_select must be passed by keyword: recent scikit-learn
# releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)
# List of selected variables: (column name, selected?, elimination rank)
list(zip(X_train.columns, rfe.support_, rfe.ranking_))
# Variables which have RFE support as true
col = X_train.columns[rfe.support_]
col

Output :

Index(['December', 'January', 'July', 'June', 'November', 'October',
       'September', 'Sunday', 'Light Snow & Rain', 'Mist & Cloudy', 'Summer',
       'Winter', 'temp', 'humidity', 'windspeed'],
      dtype='object')

MODEL 1

# Keep only the RFE-selected columns and add the 'const' intercept column
# that statsmodels OLS needs.
X_train_rfe = sm.add_constant(X_train[col])
# Fit the first OLS model on the constant-augmented design matrix.
lm = sm.OLS(y_train, X_train_rfe).fit()
# Strip 'const' again so X_train_rfe holds predictors only.
X_train_rfe = X_train_rfe.drop(columns=['const'])
print(lm.summary())

# VIF of every predictor in this model, highest first.
X = X_train_rfe
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values("VIF", ascending=False)
vif

MODEL 2

# Remove "January" from the predictor set.
X_train_new = X_train_rfe.drop(columns=["January"])
# Refit OLS with an intercept column, then strip 'const' so that
# X_train_lm contains predictors only.
X_train_lm = sm.add_constant(X_train_new)
lm_new = sm.OLS(y_train, X_train_lm).fit()
X_train_lm = X_train_lm.drop(columns=['const'])
print(lm_new.summary())

# VIF of every predictor left after removing "January", highest first.
X = X_train_new
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values("VIF", ascending=False)
vif

MODEL 3

# Remove "humidity" from the predictor set.
X_train_new_1 = X_train_lm.drop(columns=["humidity"])
# Refit OLS with an intercept column, then strip 'const' so that
# X_train_lm_1 contains predictors only.
X_train_lm_1 = sm.add_constant(X_train_new_1)
lm_1 = sm.OLS(y_train, X_train_lm_1).fit()
X_train_lm_1 = X_train_lm_1.drop(columns=['const'])
print(lm_1.summary())

# VIF of every predictor left after removing "humidity", highest first.
X = X_train_new_1
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values("VIF", ascending=False)
vif

MODEL 4

# Remove "Winter" from the predictor set.
X_train_new_2 = X_train_lm_1.drop(columns=["Winter"])
# Refit OLS with an intercept column, then strip 'const' so that
# X_train_lm_2 contains predictors only.
X_train_lm_2 = sm.add_constant(X_train_new_2)
lm_2 = sm.OLS(y_train, X_train_lm_2).fit()
X_train_lm_2 = X_train_lm_2.drop(columns=['const'])
print(lm_2.summary())

# VIF of every predictor left after removing "Winter", highest first.
X = X_train_new_2
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values("VIF", ascending=False)
vif

MODEL 5

# Remove "June" from the predictor set.
X_train_new_3 = X_train_lm_2.drop(columns=["June"])
# Refit OLS with an intercept column, then strip 'const' so that
# X_train_lm_3 contains predictors only.
X_train_lm_3 = sm.add_constant(X_train_new_3)
lm_3 = sm.OLS(y_train, X_train_lm_3).fit()
X_train_lm_3 = X_train_lm_3.drop(columns=['const'])
print(lm_3.summary())

# VIF of every predictor left after removing "June", highest first.
X = X_train_new_3
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values("VIF", ascending=False)
vif

MODEL 6

#Dropping July
X_train_new_4 = X_train_lm_3.drop(["July"], axis = 1)
#Rebuilding the model without "July"
# lm_4 is fitted on the design matrix that INCLUDES the 'const' column,
# so its parameter vector has one more entry than there are predictors.
X_train_lm_4 = sm.add_constant(X_train_new_4)
lm_4 = sm.OLS(y_train, X_train_lm_4).fit()
# After this line X_train_lm_4 holds the predictors only (no 'const').
X_train_lm_4 = X_train_lm_4.drop(['const'], axis=1)
print(lm_4.summary())

#checking VIF for new model without July
vif = pd.DataFrame()
X = X_train_new_4
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

# predict() multiplies the supplied matrix by the fitted parameters, so the
# matrix must have the same columns the model was fitted on. 'const' was
# dropped from X_train_lm_4 above, which left 10 columns against 11
# parameters and raised "shapes (510,10) and (11,) not aligned".
# Re-adding the constant restores the fitted design matrix.
y_train_pred = lm_4.predict(sm.add_constant(X_train_lm_4))

Error :

ValueError                                Traceback (most recent call last)
<ipython-input-38-f48f554d210b> in <module>
----> 1 y_train_pred = lm_4.predict(X_train_lm_4)

C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\base\model.py in predict(self, exog, transform, *args, **kwargs)
   1097             exog = np.atleast_2d(exog)  # needed in count model shape[1]
   1098 
-> 1099         predict_results = self.model.predict(self.params, exog, *args,
   1100                                              **kwargs)
   1101 

C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\regression\linear_model.py in predict(self, params, exog)
    378             exog = self.exog
    379 
--> 380         return np.dot(exog, params)
    381 
    382     def get_distribution(self, params, scale, exog=None, dist_class=None):

<__array_function__ internals> in dot(*args, **kwargs)

ValueError: shapes (510,10) and (11,) not aligned: 10 (dim 1) != 11 (dim 0)

I scaled all the numerical variables before creating the model, as follows:

# Columns to rescale; note that the target 'bike_count' is scaled together
# with the numeric predictors by the same scaler.
num_vars = [
    'temp',
    'humidity',
    'windspeed',
    'bike_count',
]
scaler = MinMaxScaler()
# Fit the scaler on the training data and overwrite the columns in place.
bike_train[num_vars] = scaler.fit_transform(bike_train[num_vars])
bike_train.head()

Please tell me what I have done wrong. Thank you in advance!

0

There are 0 answers