Error on classification_report after used LabelEncoder and Xgboost

47 views Asked by At

I am trying to perform text classification using sklearn on this dataset and compare different classifiers to find the better one.

I am using the most recently version of XGBoost where it is mandatory to use something like LabelEncoder to encode the y_train.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
import re, nltk, sys

dataset = load_dataset("celsowm/bbc_news_ptbr")
df = pd.DataFrame(dataset['train'])
X = df['texto']
y = df['categoria']

stemmer = RSLPStemmer()
stop_words = set(stopwords.words('portuguese'))

def preprocess_text(text):
    
    text = re.sub(r'[^a-zA-ZÀ-Úà-ú]', ' ', text.lower())
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and len(token) > 1]
    return ' '.join(tokens)

X = X.apply(preprocess_text)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("XGBoost", XGBClassifier(tree_method='gpu_hist'))
]

le = LabelEncoder()

for classifier_name, classifier in classifiers:
    
    print(f"treinando com {classifier_name}:")
    
    y_train = le.fit_transform(y_train)
    classifier.fit(X_train_tfidf, y_train)
    y_pred = classifier.predict(X_test_tfidf)
    y_pred = le.inverse_transform(y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Precisão: {accuracy:.2f}')
    classification_rep = classification_report(y_test, y_pred)
    print(f"Classificador: {classifier_name}")
    print(classification_rep)
    print("\n")

MultinomialNB worked very well, but XGBoost on classification_report raised this error :

Mix of label input types (string and number)

How can I fix that?

0

There are 0 answers