Trying to use the multiprocessing library in Python but I am running into issues where it freezes but throws no error

42 views Asked by At

So I've written this code after consulting chatgpt and it works for the most part:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from multiprocessing import Pool, cpu_count

def evaluate_subset(model,scoring, X_in, y_in, subset = None):
    #model = RandomForestClassifier(n_estimators=100, random_state=42)
    list_scores = []
    for train_index, test_index in skf.split(X_in, y_in):
        X_train, y_train = X_in.values[train_index], y_in.values[train_index]
        X_test,  y_test  = X_in.values[test_index],  y_in.values[test_index]
        model.fit(X_train[:, subset], y_train)
        y_pred = model.predict(X_test[:, subset])
        list_scores.append(scoring(y_test, y_pred))
    return np.mean(list_scores)

def stepwise_add_selection(model, scoring, X_in, y_in, n_processes=None):
    if n_processes is None:
        n_processes = cpu_count()

    pool = Pool(processes=n_processes)
    remaining_features = set(range(X_in.shape[1]))
    selected_features = []
    best_accuracy = 0
    while remaining_features:
        results = []
        for feature in remaining_features:
            subset = selected_features + [feature]
            results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, subset)))
        accuracies = [res.get() for res in results]
        best_index = np.argmax(accuracies)
        print("Current Best")
        print(max(accuracies))
        print("Previous Best")
        print(best_accuracy)
        print(selected_features)
        if best_accuracy < max(accuracies):
            selected_features.append(list(remaining_features)[(best_index)])
            best_accuracy = accuracies[best_index]
        else:
            break
        
    pool.close()
    pool.join()

    return selected_features, best_accuracy

However, I am trying to create another greedy search which removes features:


def stepwise_feature_removal(model, scoring, X_in, y_in, n_processes=None):
    remaining_features = set(range(X_train.shape[1]))
    selected_features = list(remaining_features)
    best_accuracy = evaluate_subset(model, scoring, X_in, y_in, selected_features)
    print("Initial accuracy score:", best_accuracy)
    while remaining_features:
        results = []
        worst_feature = None
        pool = Pool(processes=n_processes)
        for feature in remaining_features:
            temp_features = selected_features[:]
            temp_features.remove(feature)
            results.append(pool.apply_async(evaluate_subset, args=(model, scoring, X_in, y_in, temp_features)))
        pool.close()
        pool.join()
        accuracies = [res.get() for res in results]
        best_index = np.argmax(accuracies)
        if accuracies[best_index] > best_accuracy:
            best_accuracy = accuracies[best_index]
            worst_feature = temp_features[best_index]
        print("Current Best")
        print(accuracy)
        print("Previous Best")
        print(best_accuracy)
        print("Feature removed:")
        print(worst_feature)

        if worst_feature is not None:
            selected_features.remove(worst_feature)
            remaining_features.remove(worst_feature)
        else:
            break

    return selected_features, best_accuracy

In the feature removal approach the issue I am running into is that the program just stops running. It does not give an indication that there is an error or anything. I added both pool.close() and pool.join() but its not fixing the issue.

Thanks ahead of time.

I am trying to write a greedy feature reduction function that works similar to the greedy feature addition function. Not sure why it is freezing so that would be helpful as well.

Edit: I should've clarified the issue occurs when I run this code with imblearn packages. Without imblearn the multiprocessing can occur and the program runs.

def use_pipeline(clf, resample = False):
    if resample == False:
        pipe = make_pipeline(MinMaxScaler(), clf)
    else:
        pipe = make_pipeline(resample, MinMaxScaler(), clf)
    return pipe
sm  = SMOTE (random_state=38)
pipe_clf = use_pipeline(clf1, sm)
stepwise_feature_removal(pipe_clf, matthews_corrcoef, X_train, y_train, 15)
0

There are 0 answers