I am trying to run a NLP Sentiment Analysis using Naives Bayes algorithm. But I encounter the following error in the final stage, In [13]:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[13], line 1
----> 1 acc = accuracy(X_train, Y_train, X_test, Y_test)
2 acc
Cell In[12], line 5, in accuracy(X_train, Y_train, X_test, Y_test)
2 pred = []
4 for i in range(X_test.shape[0]):
----> 5 p = predict(X_train,Y_train, X_test[i])
6 pred.append(p)
8 Y_pred = np.array(pred)
Cell In[11], line 10, in predict(X_train, Y_train, X_test)
8 likelihood = 1.0
9 for fea in range(n_features):
---> 10 cond = cond_probab(X_train, Y_train, fea, X_test[fea], label)
11 likelihood = likelihood * cond
13 prior = prior_probab(Y_train, label)
IndexError: invalid index to scalar variable.
The whole code in Python (in Jupyter Notebook) is the following:
In 1: Import
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
In [2]: Import dataset from Kaggle: Mushroom Classification
df = pd.read_csv("C:/.../mushrooms.csv")
In [3 - 5]: Convert values into numbers
le = LabelEncoder()
df_encoded = df.apply(le.fit_transform, axis = 0)
df = df_encoded.values
In [6 - 7]: Define X and Y datasets
X = df[: , 1: ]
Y = df[:,0]
In [8]: Split dataset
X_train, Y_train, X_test, Y_test = train_test_split(
X, Y, test_size = 0.20, random_state = 42)
In [9]: Create a function to find Prior Probability:
def prior_probab(Y_train, label):
m = Y_train.shape[0]
s = np.sum(Y_train == label)
return m/s
In [10]: Create Condition Probability to find Likelihood
def cond_probab(X_train, Y_train, feature_col, feature_val, label):
X_filtered = X_train[Y_train == label]
num = np.sum(X_filtered[:,feature_col] == feature_val)
denom = X_filtered.shape[0]
return float(num/denom)
In [11]: Create Prediction
def predict(X_train, Y_train, X_test):
classes = np.unique(Y_train)
n_features = X_train.shape[1]
posterior_probab = []
for label in classes:
likelihood = 1.0
for fea in range(n_features):
cond = cond_probab(X_train, Y_train, fea, X_test[fea], label)
likelihood = likelihood * cond
prior = prior_probab(Y_train, label)
post = likelihood * prior
posterior_probab.append(post)
pred = np.argmax(posterior_probab)
return pred
In [12]: Create Accuracy
def accuracy(X_train, Y_train, X_test, Y_test):
pred = []
for i in range(X_test.shape[0]):
p = predict(X_train,Y_train, X_test[i])
pred.append(p)
Y_pred = np.array(pred)
acc = np.sum(Y_pred == Y_test)/Y_pred.shape[0]
return acc
In [13]: Result
acc = accuracy(X_train, Y_train, X_test, Y_test)
acc