I have this code:
import cudf
import cuml
import cupy as cp
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from cuml.model_selection import GridSearchCV
from cuml.metrics import accuracy_score
import pandas as pandas
#---------------------------------------------------------------------------------------------------
def preparaDatos(y,pathMetadata,pathMatrizUnitigs) :
print("Preparing data for modeling...")
y_column = pandas.read_csv(pathMetadata,index_col=0,delimiter=';')
y_column = y_column.dropna(subset=[y])
y_column = y_column[y]
X = pandas.read_csv(pathMatrizUnitigs, sep="\t", index_col=0, low_memory=False)
X = X.transpose()
X = X[X.index.isin(y_column.index)]
y_column = y_column[y_column.index.isin(X.index)]
ordered_indices = y_column.index
X_ordered = X.loc[ordered_indices]
return X_ordered, y_column
#---------------------------------------------------------------------------------------------------
def main():
espacioTrabajo = ""
archUnitigs = "graph.nodes"
archMatrizUnitigs = "salida_Unitig_caller.rtab"
archMetadata = "metadata.csv"
archMatrizUnitigsFiltrada = "salida_Unitig_caller_gt90.txt"
pathArchUnitigs = espacioTrabajo + archUnitigs
pathMetadata = espacioTrabajo + archMetadata
pathArchMatrizUnitigsFiltrada = espacioTrabajo + archMatrizUnitigsFiltrada
classField="VC"
X, y = preparaDatos(classField,pathMetadata,pathArchMatrizUnitigsFiltrada)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_cudf = cudf.DataFrame(X_train)
X_test_cudf = cudf.DataFrame(X_test)
y_train_cudf = cudf.Series(y_train)
y_test_cudf = cudf.Series(y_test)
X_train_cudf = X_train_cudf.astype('float32')
y_train_cudf = y_train_cudf.astype('float32')
# Definir los parámetros a probar en la búsqueda de cuadrícula
params = {
'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 1, 1.3, 1.5, 1.7, 2.0, 3.0, 4,0, 5.0, 6.0, 7.0, 8.0, 9.0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
'gamma': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
'kernel': ['linear', 'poly' , 'rbf' , 'sigmoid']
}
svm_model = cuml.svm.SVC(cache_size=10240)
grid_search = GridSearchCV(estimator=svm_model, param_grid=params, cv=3, n_jobs=-1)
print("Performing GRID search...")
grid_search.fit(X_train_cudf.to_numpy(), y_train_cudf.to_numpy())
best_params = grid_search.best_params_
print("Best params: ")
print(best_params)
#----------------------------------------------------------------------------------------------------
if __name__ == "__main__":
main()
When I run it, I get this error:
Traceback (most recent call last):
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/spyder_kernels/py3compat.py:356 in compat_exec
exec(code, globals, locals)
File ~/Documentos/ML/Python/Pruebas/SVM_GPU_6.py:106
main()
File ~/Documentos/ML/Python/Pruebas/SVM_GPU_6.py:85 in main
grid_search.fit(X_train_cudf.to_numpy(), y_train_cudf.to_numpy())
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/base.py:1474 in wrapper
return fit_method(estimator, *args, **kwargs)
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/model_selection/_search.py:970 in fit
self._run_search(evaluate_candidates)
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/model_selection/_search.py:1527 in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/model_selection/_search.py:947 in evaluate_candidates
_warn_or_raise_about_fit_failures(out, self.error_score)
File ~/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:536 in _warn_or_raise_about_fit_failures
raise ValueError(all_fits_failed_message)
ValueError:
All the 3168 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
3168 fits failed with the following error:
Traceback (most recent call last):
File "/home/veronica/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/veronica/anaconda3/envs/rapids-24.02/lib/python3.10/site-packages/cuml/internals/api_decorators.py", line 188, in wrapper
ret = func(*args, **kwargs)
File "svc.pyx", line 545, in cuml.svm.svc.SVC.fit
File "svc.pyx", line 547, in cuml.svm.svc.SVC.fit
MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/veronica/anaconda3/envs/rapids-24.02/include/rmm/mr/device/cuda_memory_resource.hpp
I would like to know if it would be possible to optimize the code so that this error does not occur. I've been looking for a way to set the amount of available memory for GridSearchCV but haven't found the option. My other question is whether it's a good practice to use so many parameters to find the best model or if another strategy is commonly used?
I have a 11th Gen Intel Core i5-11400 @2.60GHz and a NVIDIA GeForce RTX 3080 Ti GPU (12.88 GB available).
The version of rapids is 24.02 and I am running the code using Spyder 3.10.