So I've written this code after consulting chatgpt and it works for the most part:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from multiprocessing import Pool, cpu_count
def evaluate_subset(model,scoring, X_in, y_in, subset = None):
#model = RandomForestClassifier(n_estimators=100, random_state=42)
list_scores = []
for train_index, test_index in skf.split(X_in, y_in):
X_train, y_train = X_in.values[train_index], y_in.values[train_index]
X_test, y_test = X_in.values[test_index], y_in.values[test_index]
model.fit(X_train[:, subset], y_train)
y_pred = model.predict(X_test[:, subset])
list_scores.append(scoring(y_test, y_pred))
return np.mean(list_scores)
def stepwise_add_selection(model, scoring, X_in, y_in, n_processes=None):
if n_processes is None:
n_processes = cpu_count()
pool = Pool(processes=n_processes)
remaining_features = set(range(X_in.shape[1]))
selected_features = []
best_accuracy = 0
while remaining_features:
results = []
for feature in remaining_features:
subset = selected_features + [feature]
results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, subset)))
accuracies = [res.get() for res in results]
best_index = np.argmax(accuracies)
print("Current Best")
print(max(accuracies))
print("Previous Best")
print(best_accuracy)
print(selected_features)
if best_accuracy < max(accuracies):
selected_features.append(list(remaining_features)[(best_index)])
best_accuracy = accuracies[best_index]
else:
break
pool.close()
pool.join()
return selected_features, best_accuracy
However, I am trying to create another greedy search which removes features:
def stepwise_feature_removal(model, scoring, X_in, y_in, n_processes=None):
remaining_features = set(range(X_train.shape[1]))
selected_features = list(remaining_features)
best_accuracy = evaluate_subset(model, scoring, X_in, y_in, selected_features)
print("Initial accuracy score:", best_accuracy)
while remaining_features:
results = []
worst_feature = None
pool = Pool(processes=n_processes)
for feature in remaining_features:
temp_features = selected_features[:]
temp_features.remove(feature)
results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, temp_features)))
pool.close()
pool.join()
accuracies = [res.get() for res in results]
best_index = np.argmax(accuracies)
if accuracies[best_index] > best_accuracy:
best_accuracy = accuracies[best_index]
worst_feature = temp_features[best_index]
print("Current Best")
print(accuracy)
print("Previous Best")
print(best_accuracy)
print("Feature removed:")
print(worst_feature)
if worst_feature is not None:
selected_features.remove(worst_feature)
remaining_features.remove(worst_feature)
else:
break
return selected_features, best_accuracy
In the feature removal approach the issue I am running into is that the program just stops running. It does not give an indication that there is an error or anything. I added both pool.close() and pool.join() but its not fixing the issue.
Thanks ahead of time.
I am trying to write a greedy feature reduction function that works similar to the greedy feature addition function. Not sure why it is freezing so that would be helpful as well.
Edit: I should've clarified the issue occurs when I run this code with imblearn packages. Without imblearn the multiprocessing can occur and the program runs.
def use_pipeline(clf, resample = False):
if resample == False:
pipe = make_pipeline(MinMaxScaler(), clf)
else:
pipe = make_pipeline(resample, MinMaxScaler(), clf)
return pipe
sm = SMOTE (random_state=38)
pipe_clf = use_pipeline(clf1, sm)
stepwise_feature_removal(pipe_clf, matthews_corrcoef, X_train, y_train, 15)