How to select hyperparameters for Kernel PCA in python using hyperopt?

930 views Asked by At

I am looking into applying Kernel Principal Component Analysis (KPCA) to reduce the dimensionality of my feature matrix set to obtain a cluster of datapoints. I went through the parameters used in KPCA in scikit learn package and understood that there are some parameters that should work if one of them is selected (For instance, if gamma is selected then degree and coefficient are not used). Furthermore, I went through the following links that look into the hyperparameters method used for classification models:

I tried to code and combine the hyperopt code with KPCA, but, I keep on getting errors at the area dealing with scoring of the PCA model. I know that KPCA does not have a score in order to find the accuracy of the PCA model, so, how can I overcome this error? I tried several scoring methods and either I get an error from inverse_fit or the size of the array. Please find the code and error message below.

Code:

from sklearn.decomposition import PCA, KernelPCA, SparsePCA, IncrementalPCA
from hyperopt import hp, tpe, atpe, fmin, Trials, rand, STATUS_OK

# Implementing Hyperparamater method:
models = {'pca'      : PCA,
          'kpca'     : KernelPCA,
          'spca'     : SparsePCA,
          # 'ipca'     : IncrementalPCA
          }

def search_space(model):
    # Initialising variables:
    model = model.lower()
    space = {}

    # Calling the models:
    if model == 'pca':
        space = {'svd_solver'        : hp.choice('svd_solver', ["auto", "full", "arpack", "randomized"]),
                 }

    elif model == 'kpca':
        space = {'kernel'            : hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed']),
                 'gamma'             : hp.choice('gamma', np.arange(0.03, 0.05, 0.002)),
                 'degree'            : hp.choice('degree', range(1, 10, 1)),
                 'coef0'             : hp.choice('coef0', np.arange(1, 10, 0.2))
                 }

    elif model == 'spca':
        space = {'alpha'             : hp.choice('alpha', np.arange(1.0, 15.0, 0.2)),
                 'ridge_alpha'       : hp.choice('ridge_alpha', np.linspace(0.01, 0.3, 30)),
                 'method'            : hp.choice('method', ['lars', 'cd']),
                 'max_iter'          : hp.choice('max_iter', [1000, 1500, 2000, 2500, 3000])
                 }

    # elif model == 'ipca':
    #     space = {'batch_size'        : hp.choice('batch_size', ['gini', 'entropy']),
    #              }
    space['model'] = model
    return space

def obj_fnc(params):
    model = params.get('model').lower()
    # X_ = scale_normalize(params, X[:])
    del params['model']
    clf = models[model](**params)
    return (get_acc_status(clf, X))

def get_acc_status(clf, X):
    X_reduced = clf.fit_transform(X)
    # X_prereduced = clf.fit_inverse_transform(X_reduced)
    # acc = -1 * mean_squared_error(X, X_prereduced)
    X_prereduced = clf.inverse_transform(X_reduced)
    # acc = -1 * mean_absolute_error(X, X_prereduced)
    acc = -1 * r2_score(X, X_prereduced)
    # acc = cross_val_score(clf, X).mean()
    return {'loss': -acc, 'status': STATUS_OK}

##### Hyperparameter optimisation:
# Running Bayesian Optimisation to get the best parameters:
start = time.time()

# Create the algorithms
tpe_algo = tpe.suggest
# rand_algo = rand.suggest
# atpe_algo = atpe.suggest

# Assigning model:
model = 'kpca'

# Creating the trial objects:
hypopt_trials = Trials()

# Getting the best parameters:
best_params = fmin(obj_fnc, search_space(model), algo=tpe_algo, max_evals=500, trials=hypopt_trials)
print("Best params: ", best_params)
print('Best accuracy: ', hypopt_trials.best_trial['result']['loss'])
print("[INFO] Baye. Opt. search took {:.2f} seconds".format(time.time() - start))

# Calling parameters:
## PCA:
svd_solver = ["auto", "full", "arpack", "randomized"]
## KPCA:
kernel =  ["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"]
gamma = np.arange(0.1, 0.9, 0.01)
degree = range(1, 10, 1)
coef0 = np.arange(1, 10, 0.2)
kernel_gamma = ["poly", "rbf", "sigmoid"]
kernel_degree = "poly"
kernel_coef0 = "sigmoid"
## SPCA:
alpha = np.arange(1.0, 15.0, 0.2)
ridge_alpha = np.linspace(0.01, 0.3, 30)
method = ['lars', 'cd']
max_iter = [1000, 1500, 2000, 2500, 3000]

# Creating the PCA models:
# pca = PCA(n_components=2, svd_solver=svd_solver[best_params['svd_solver'])
if any(x in best_params for x in kernel_gamma):
    pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]))
    if any(x in best_params for x in kernel_degree):
        pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), degree='{0}'.format(degree[best_params['degree']]), coef0='{0}'.format(coef0[best_params['coef0']]))                  
    if any(x in best_params for x in kernel_coef0):
        pca = KernelPCA(n_components=2, kernel=kernel[best_params['kernel']], gamma='{0}'.format(gamma[best_params['gamma']]), coef0='{0}'.format(coef0[best_params['coef0']]))                  
# pca = SparsePCA(n_components=2, alpha='{0}'.format(alpha[best_params['alpha']]), ridge_alpha='{0}'.format(ridge_alpha[best_params['ridge_alpha']]), method=method[best_params['method']], max_iter='{0}'.format(max_iter[best_params['max_iter']]))
# pca = IncrementalPCA(n_components=2)
print('Model: ', pca)
PrincipalComponents = pca.fit_transform(X_std)
principalDf = pd.DataFrame(data = PrincipalComponents, columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, dataframe[['Label']]], axis = 1)
print('Principal Component Analysis: ')
print(principalDf)

Error Messages:

Error message (1):

ValueError: There are significant negative eigenvalues (1.11715 of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.

Error message (2):

ValueError: Precomputed metric requires shape (n_queries, n_indexed). Got (50, 14) for 50 indexed.
0

There are 0 answers