How do I do nested cross validation with sequentialfeatureselector?


I have the following snippet I've written for a nested cross-validation loop, but I'm confused about how to incorporate SequentialFeatureSelector into the mix, since it has its own cv argument. I'm thinking I need to do something similar to the "space = dict()" references in https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/, or better yet, how would I use it with nested_cv? To show what I mean by the article's pattern, I've added a rough sketch of it below my snippet.

# assumed imports for this snippet
from random import sample
import numpy as np
from sklearn.model_selection import KFold

# configure the cross-validation procedure
outer_k = 10
inner_k = 10
random_st = sample(list(np.arange(0,10,1)),1)[0]
#print(random_st)

cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st+1)

outer_results = []
for outer_train_ix, outer_test_ix in cv_outer.split(X.index):

    inner_results = []
    for inner_train_ix, inner_test_ix in cv_inner.split(outer_train_ix):
        # note: these are positions within outer_train_ix, not row indices of X
        print("inner_train_ix", inner_train_ix)
        print("inner_test_ix",inner_test_ix)
        
        #inner_results.append(inner_errors)
        
    #best_model parms selected from the loop above
    
    #best_model fitted to outer_train_ix, and out of sample errors are derived from outer_test_ix
       
    print("outer_train_ix",outer_train_ix)
    print("outer_test_ix",outer_test_ix)
    #outer_results.append(outer_errors)
        
#model that performed best on the outer (out of sample) forecasts is selected        
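
To show what I mean by the article's pattern: the article builds an inner GridSearchCV (that's where "space = dict()" comes from) inside the outer loop, and I'm asking whether SFS, which also takes a cv argument, can sit in that inner slot instead. A rough sketch of the article's approach, reusing its random forest search space as a placeholder and assuming X and y are my feature and target frames:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

for outer_train_ix, outer_test_ix in cv_outer.split(X.index):
    X_train, X_test = X.iloc[outer_train_ix], X.iloc[outer_test_ix]
    y_train, y_test = y.iloc[outer_train_ix], y.iloc[outer_test_ix]

    # define search space (the "space = dict()" part of the article)
    space = dict()
    space['n_estimators'] = [10, 100, 500]
    space['max_features'] = [2, 4, 6]

    # inner loop: hyperparameter search with its own CV, fit only on the outer training fold
    search = GridSearchCV(RandomForestRegressor(), space,
                          scoring='neg_mean_squared_error', cv=cv_inner, refit=True)
    best_model = search.fit(X_train, y_train.values.ravel()).best_estimator_

    # outer loop: out-of-sample error of the refit best model on the held-out fold
    y_pred = best_model.predict(X_test)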

1 Answer

Answer by thistleknot:

I believe I figured it out. I was misunderstanding things: the model here is the SFS procedure itself, and the selected features are not to be confused with hyperparameters. Nested CV would be used for comparing models with each other and then picking the best one; since there is only one model here, I'm simply deriving its nested CV error score.

https://gist.github.com/thistleknot/3a46e8a9cba8067ea7061828dbe31e8d
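
Before the full script: the same nested structure can also be written more compactly by letting SFS's own cv argument act as the inner loop and scoring the whole selector-plus-regression pipeline on the outer folds. This is only a minimal sketch of that equivalent (it assumes mlxtend's SFS is used as a transformer inside a scikit-learn Pipeline and reuses the X, y, cv_inner and cv_outer defined in the script below); the full script then does the same thing explicitly, fold by fold.

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# inner loop: SFS picks a feature subset with its own cv on each outer training fold
pipe = Pipeline([
    ('sfs', SFS(LinearRegression(),
                k_features=(1, X.shape[1]),
                forward=True,
                floating=False,
                scoring='neg_mean_squared_error',
                cv=cv_inner)),
    ('lm', LinearRegression()),
])

# outer loop: out-of-sample error of the whole select-then-fit procedure
nested_scores = cross_val_score(pipe, X, y.values.ravel(),
                                scoring='neg_mean_absolute_error', cv=cv_outer)
print("nested CV MAE:", -nested_scores.mean(), "+/-", nested_scores.std())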

# imports used by this script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from random import sample
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from dask.distributed import Client

data = pd.read_csv('C:\\Users\\User\\Documents\\wiki\\wiki\\dev\\python\\Python-Stock\\data\\raw\\states.csv')
independent = 'Poverty'

y = data[[independent]]
X = data.loc[:, ~data.columns.isin([independent,'States'])].copy()

# configure the cross-validation procedure
outer_k = 10
inner_k = 10
random_st = sample(list(np.arange(0,10,1)),1)[0]
#print(random_st)

cv_inner = KFold(n_splits=inner_k, shuffle=True, random_state=random_st)
cv_outer = KFold(n_splits=outer_k, shuffle=True, random_state=random_st+1)

outer_results = []
# connect to a running Dask scheduler so the 'dask' joblib backend can parallelize SFS
client = Client('192.168.3.100:8786',timeout=3)

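# outer loop: each outer fold is held out once to estimate out-of-sample error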
for outer_train_ix, outer_test_ix in cv_outer.split(X.index):

    inner_results = []

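    # inner loop: feature selection only ever sees rows from the outer training fold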
    for inner_train_ix, inner_test_ix in cv_inner.split(outer_train_ix):

        #print("inner_train_ix",inner_train_ix)
        #print("inner_test_ix",inner_test_ix)

        # inner_*_ix are positions within outer_train_ix, so map them back to rows of X
        X_train_inner = X.iloc[outer_train_ix[inner_train_ix]]
        X_test_inner = X.iloc[outer_train_ix[inner_test_ix]]
        y_train_inner = y.iloc[outer_train_ix[inner_train_ix]]
        y_test_inner = y.iloc[outer_train_ix[inner_test_ix]]

        """
        # define search space
        space = dict()
        space['n_estimators'] = [10, 100, 500]
        space['max_features'] = [2, 4, 6]
        """

        lm = LinearRegression()

        sfs1 = SFS(estimator=lm, 
                   k_features=(1, len(X.columns)),
                   forward=True, 
                   floating=False, 
                   n_jobs=-1,
                   scoring='neg_mean_squared_error',
                   cv=cv_inner)         

        with joblib.parallel_backend('dask'):
            sfs1.fit(X_train_inner, y_train_inner)

        #plt.plot(pd.DataFrame(sfs1.get_metric_dict()).T['avg_score'])
        #plt.show()    

        # avg_score is negative MSE, so the smallest absolute value marks the best subset
        metrics = pd.DataFrame(sfs1.get_metric_dict()).T
        best_ix = np.argmin(abs(metrics['avg_score']))
        best_features = metrics['feature_names'].iloc[best_ix]
        #print(len(best_features), best_features)
        best_score = metrics['avg_score'].iloc[best_ix]
        #print("best_score", best_score)

        inner_results.append([best_features,best_score])

    # choose the feature subset from the inner fold with the best (least negative) avg_score
    inner_df = pd.DataFrame(inner_results, columns=['features', 'score'])
    best_subset = inner_df['features'].iloc[np.argmax(inner_df['score'])]

    # refit on the outer training fold using only the features chosen by the inner loop
    outer_model = LinearRegression()

    outer_model.fit(X.iloc[outer_train_ix][list(best_subset)], y.iloc[outer_train_ix])

    y_pred = outer_model.predict(X.iloc[outer_test_ix][list(best_subset)])

    # out-of-sample mean absolute error for this outer fold
    outer_results.append(np.mean(abs(y_pred.ravel() - y.iloc[outer_test_ix].values.ravel())))
    #print("outer_train_ix",outer_train_ix)
    print("outer_test_ix",outer_test_ix)

#since there is only one model, the outer (out of sample) fold errors are its nested CV estimate
print()
print("outer cv abs error mean: ",np.mean(outer_results))
print("outer cv abs error std: ",np.std(outer_results))

print("Final Model")
final_model = LinearRegression()

# note: the selector should wrap final_model, not the lm left over from the inner loop
sfs2 = SFS(estimator=final_model, 
           k_features=(1, len(X.columns)),
           forward=True, 
           floating=False, 
           n_jobs=-1,
           scoring='neg_mean_squared_error',
           cv=cv_outer)   

with joblib.parallel_backend('dask'):
    sfs2.fit(X,y)
    
plt.plot(pd.DataFrame(sfs2.get_metric_dict()).T['avg_score'])  # plot sfs2, not the inner-loop sfs1
plt.show()

metrics2 = pd.DataFrame(sfs2.get_metric_dict()).T
best_ix2 = np.argmin(abs(metrics2['avg_score']))  # avg_score is negative MSE; smallest magnitude is best
best_features = metrics2['feature_names'].iloc[best_ix2]
print(len(best_features), best_features)
best_score = metrics2['avg_score'].iloc[best_ix2]
print("best_score", best_score)

import statsmodels.api as sm

#add a constant to the selected predictor variables
x = sm.add_constant(X[list(best_features)])

#fit linear regression model
model = sm.OLS(y, x).fit()

#view model summary
print(model.summary())