How to find most relevant variables with PCA dimension reduction?

212 views Asked by At

I'm new to python coding, and working on a project but stuck at coding part. I have one target variable and 23 relevant variables. My dataset is 11(simples)*23(descriptors) and one target dataset 11(simples)*1(target variable). How do I find the most relevant variables from these 23 descriptors with PCA dimensional reduction?

    pca = PCA()
pca.fit(df2)
transformed = pca.transform(df2)

    from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
steps = [('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)`

    from sklearn.preprocessing import MinMaxScaler
steps = [('norm', MinMaxScaler()), ('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)

    from sklearn.datasets import make_classification
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
print(X.shape, y.shape)

    from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
steps = [('pca', PCA(n_components=4)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

enter image description heredata

enter image description here I'm supposed to get this image but I don't know how to do.

0

There are 0 answers