python random search with pipeline and lasso features selection

60 views Asked by At

I am a python beginner and I can't figure out how to solve this by myself. Here is my code with _preprocess_X and get_interesting_genes my own functions whose working, no issue with them. Unfortunatly the scoring always return nan due to the lasso selection. BUT the second version bellow is working great without the lasso selection and the label encodeur

I guess it is because pipe as no pipe.classes_ attribute with the lasso selection which is why I inputted directly the classes names in the fit function pour the first version.

I would love some help on this, I'm new to pipelines and here I am compltetly lost.

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score

def _preprocess_X(X_sparse, gene_numbers):
    X = X_sparse.toarray()
    X = X[:, gene_numbers]
    adata = sc.AnnData(X)
    sc.pp.log1p(adata)
    sc.pp.normalize_total(adata, exclude_highly_expressed =False)
    # sc.pp.log1p(adata)
    return adata.X


def get_interesting_genes(X_sparse,nb_genes):
    #print("Dans get_interesting_genes",type(X_sparse))
    X = X_sparse.toarray()
    adata = sc.AnnData(X)
    sc.pp.log1p(adata)
    
    sc.pp.highly_variable_genes(adata, n_top_genes=nb_genes, flavor='seurat')
    selected_genes = adata.var_names[adata.var['highly_variable']]
    gene_numbers = [int(gene) for gene in selected_genes]
    # gene_numbers=[i for i in range(1,3000)]
    return gene_numbers


from sklearn.base import BaseEstimator, ClassifierMixin

#VERSION 1

from sklearn.base import BaseEstimator, ClassifierMixin
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self,nb_genes=4000, n_estimators=100,alpha=0.05, criterion='entropy', max_depth=None):
        self.n_estimators=n_estimators
        self.criterion=criterion
        self.max_depth=max_depth
        self.alpha=alpha
        self.pipe = make_pipeline(
            SelectFromModel(Lasso(alpha=self.alpha)),
            RandomForestClassifier(
                max_depth=self.max_depth, n_estimators=self.n_estimators, max_features="log2", 
                class_weight="balanced", criterion=self.criterion,bootstrap=False
            ),
        )
        self.interesting_genes = []
        self.label_encoder = LabelEncoder()
        self.nb_genes= nb_genes
    def fit(self, X_sparse, y):
        y=self.label_encoder.fit_transform(y)
        self.interesting_genes = get_interesting_genes(X_sparse,self.nb_genes)
        gene_numbers = self.interesting_genes
        X = _preprocess_X(X_sparse, gene_numbers)
        self.pipe.fit(X, y)
        self.classes_ = np.array(['Cancer_cells','NK_cells','T_cells_CD4+','T_cells_CD8+'])
    def predict(self, X_sparse):
        gene_numbers = self.interesting_genes
        X = _preprocess_X(X_sparse, gene_numbers)
        return self.pipe.predict_proba(X)

    def get_params(self, deep=True):
        return {'n_estimators' : self.n_estimators,
                'criterion': self.criterion, 'max_depth' : self.max_depth, 
                'alpha' : self.alpha, 'nb_genes':self.nb_genes,}

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self
param_grid = {
    'alpha' : [0.5],
    'criterion' : ['entropy'],
    'n_estimators' : [200],
    'max_depth' : [4],
    #'bootstrap' : [True, False],
    'nb_genes':[13000]
}
custom_clf = Classifier()
random_search = RandomizedSearchCV(custom_clf, param_grid, cv=5, scoring='balanced_accuracy', n_iter=100, verbose=3)
random_search.fit(X_train, y_train)```


#VERSION 2
from sklearn.base import BaseEstimator, ClassifierMixin
class Classifier(BaseEstimator, ClassifierMixin):
    def __init__(self,nb_genes=4000, n_estimators=100,alpha=0.05, criterion='entropy', max_depth=None):
        self.n_estimators=n_estimators
        self.criterion=criterion
        self.max_depth=max_depth
        self.alpha=alpha
        self.pipe = make_pipeline(
            #SelectFromModel(Lasso(alpha=self.alpha)),
            RandomForestClassifier(
                max_depth=self.max_depth, n_estimators=self.n_estimators, max_features="log2", 
                class_weight="balanced", criterion=self.criterion,bootstrap=False
            ),
        )
        self.interesting_genes = []
        #self.label_encoder = LabelEncoder()
        self.nb_genes= nb_genes
    def fit(self, X_sparse, y):
        #y=self.label_encoder.fit_transform(y)
        self.interesting_genes = get_interesting_genes(X_sparse,self.nb_genes)
        gene_numbers = self.interesting_genes
        X = _preprocess_X(X_sparse, gene_numbers)
        self.pipe.fit(X, y)
        self.classes_ = self.pipe.classes_
    def predict(self, X_sparse):
        gene_numbers = self.interesting_genes
        X = _preprocess_X(X_sparse, gene_numbers)
        return self.pipe.predict_proba(X)

    def get_params(self, deep=True):
        return {'n_estimators' : self.n_estimators,
                'criterion': self.criterion, 'max_depth' : self.max_depth, 
                'alpha' : self.alpha, 'nb_genes':self.nb_genes,}

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self
param_grid = {
    'alpha' : [0.5],
    'criterion' : ['entropy'],
    'n_estimators' : [200],
    'max_depth' : [4],
    #'bootstrap' : [True, False],
    'nb_genes':[13000]
}
custom_clf = Classifier()
random_search = RandomizedSearchCV(custom_clf, param_grid, cv=5, scoring='balanced_accuracy', n_iter=100, verbose=3)
random_search.fit(X_train, y_train)

Unfortunatly I can't provide anything for X_train and y_train but y train is type <class 'pandas.core.arrays.categorical.Categorical'> with value in ['Cancer_cells','NK_cells','T_cells_CD4+','T_cells_CD8+'] and X_train is a sparse numeric array with genes expression

0

There are 0 answers