I am trying to perform text classification using sklearn
on this dataset and compare different classifiers to find the best one.
I am using the most recent version of XGBoost,
where it is mandatory to use something like LabelEncoder
to encode y_train.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
import re, nltk, sys
# Fetch the dataset and expose its 'train' split as a pandas DataFrame.
dataset = load_dataset("celsowm/bbc_news_ptbr")
df = pd.DataFrame(dataset['train'])

# Target labels and raw documents, taken from the DataFrame columns.
y = df['categoria']
X = df['texto']

# Resources shared by preprocess_text: the stop-word list is stored as a
# set so that membership tests are cheap, and a single stemmer instance is
# reused for every token.
stop_words = set(stopwords.words('portuguese'))
stemmer = RSLPStemmer()
def preprocess_text(text):
    """Turn one raw document into a space-separated string of stems.

    The text is lower-cased, every character outside the letter class
    ``a-zA-ZÀ-Úà-ú`` is replaced by a space, and the result is tokenized
    with ``word_tokenize``. Tokens that are in the module-level
    ``stop_words`` set or are at most one character long are discarded;
    the remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The document to clean, as a ``str``.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        no token survives).
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-ZÀ-Úà-ú]', ' ', lowered)

    stems = []
    for word in word_tokenize(letters_only):
        # Skip single-character leftovers and stop words.
        if len(word) <= 1 or word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    return ' '.join(stems)
# Clean every document once, before splitting.
X = X.apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# The TF-IDF vocabulary and weights are learned from the training
# documents only; the test documents are merely projected onto them.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Candidate models as (display name, unfitted estimator) pairs.
classifiers = [
    ("Multinomial Naive Bayes", MultinomialNB()),
    # tree_method='gpu_hist' is deprecated since XGBoost 2.0; GPU training
    # is now requested with the 'hist' tree method plus device='cuda'.
    ("XGBoost", XGBClassifier(tree_method='hist', device='cuda')),
]
# Encode the string labels exactly once, into a separate variable.
# Re-fitting the encoder inside the loop while overwriting y_train made the
# second iteration fit on already-encoded integers, so inverse_transform
# returned integers and the metrics saw string y_test against numeric
# y_pred ("Mix of label input types (string and number)").
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

for classifier_name, classifier in classifiers:
    print(f"treinando com {classifier_name}:")

    # Every model is trained on the same integer-encoded labels.
    classifier.fit(X_train_tfidf, y_train_encoded)

    # Map the integer predictions back to the original category strings so
    # that they are comparable with the untouched y_test.
    y_pred = le.inverse_transform(classifier.predict(X_test_tfidf))

    # NOTE: the printed label reads 'Precisão', but the value is accuracy.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Precisão: {accuracy:.2f}')

    classification_rep = classification_report(y_test, y_pred)
    print(f"Classificador: {classifier_name}")
    print(classification_rep)
    print("\n")
MultinomialNB
worked very well, but with XGBoost
the call to classification_report
raised this error:
Mix of label input types (string and number)
How can I fix that?