import numpy as np
import matplotlib.pyplot as plt  # used by plot_decision_boundaries below
from matplotlib.colors import ListedColormap
from scipy.stats import chi2_contingency
class Node_chi2:
    def __init__(self, chi2, num_samples, num_samples_per_class, predicted_class):
        self.chi2 = chi2
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None
class DecisionTree_chi2:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y, depth=0):
        classes = sorted(set(y))  # assumes integer labels 0..n_classes-1 (see bincount below)
        num_samples_per_class = [np.sum(y == i) for i in classes]
        predicted_class = classes[np.argmax(num_samples_per_class)]
        node = Node_chi2(
            chi2=0,  # chi2 is 0 at the start
            num_samples=len(y),
            num_samples_per_class=num_samples_per_class,
            predicted_class=predicted_class,
        )
        # Guard against max_depth=None (the default), which would make
        # `depth < self.max_depth` raise a TypeError:
        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self.best_split(X, y, classes)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self.fit(X_left, y_left, depth + 1)
                node.right = self.fit(X_right, y_right, depth + 1)
        self.node = node  # the outermost call assigns last, so self.node ends up as the root
        return self.node
    def best_split(self, X, y, classes):
        m, n = X.shape
        if m <= 1:
            return None, None
        best_chi2 = 0  # best chi2 score found so far
        best_idx, best_thr = None, None
        for idx in range(n):
            # Sort the samples along this feature so every midpoint between
            # two distinct consecutive values is a candidate threshold:
            thresholds, classes_sorted = zip(*sorted(zip(X[:, idx], y)))
            for i in range(1, m):
                if thresholds[i] == thresholds[i - 1]:
                    continue
                y_left = np.array(classes_sorted[:i])
                y_right = np.array(classes_sorted[i:])
                left_counts = np.bincount(y_left, minlength=len(classes))
                right_counts = np.bincount(y_right, minlength=len(classes))
                chi2, p, _, _ = chi2_contingency(np.array([left_counts, right_counts]))
                if chi2 > best_chi2:
                    best_chi2 = chi2
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr
    def predict(self, X):
        yhat = []
        for sample in X:
            node = self.node
            while node.left:  # descend until a leaf is reached
                if sample[node.feature_index] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            yhat.append(node.predicted_class)
        return np.array(yhat)
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=100, noise=0.1, random_state=42)
def plot_decision_boundaries(clf, X, y, label=''):
    # Generate a grid of points to make predictions:
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.1),
                           np.arange(x2_min, x2_max, 0.1))
    # Use the classifier to make predictions on the grid; this is where the
    # magic happens: we classify each and every point of the (x1, x2) grid.
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    colors = {0: 'C0', 1: 'C1', 2: 'C2'}  # colors indexed by class label
    # Create a contour plot to display the decision boundaries:
    plt.figure(figsize=(4, 4))
    n_classes = len(np.unique(y))
    custom_cmap = ListedColormap(list(colors.values())[:n_classes])
    plt.contourf(xx1, xx2, Z, cmap=custom_cmap, alpha=0.4)
    for c in np.unique(y):
        plt.scatter(X[y == c, 0], X[y == c, 1], c=colors[c],
                    marker='o', edgecolors='k', alpha=0.5)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.title('Decision Boundaries - ' + label + ' DT')
    plt.show()
clf_chi2 = DecisionTree_chi2(max_depth=5)
clf_chi2.fit(X, y)
plot_decision_boundaries(clf_chi2, X, y, label='Chi2')
The error that I am getting is "ValueError: The internally computed table of expected frequencies has a zero element at (0, 0)". The only solution that I have found is to put:

    left_counts = np.bincount(y_left, minlength=len(classes)) + 1e-10  # add small constant to avoid zero frequencies
    right_counts = np.bincount(y_right, minlength=len(classes)) + 1e-10  # add small constant to avoid zero frequencies

but I am not satisfied with that answer and would like to fix it without having to add a small constant; it does not seem correct to me.
The input that is causing the error is essentially a 2 x n_classes contingency table in which one class never occurs, something like:
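    import numpy as np
    from scipy.stats import chi2_contingency

    # Illustrative counts (the actual numbers vary from split to split):
    # class 0 is absent from both sides, so the first column of the table
    # sums to zero and the expected frequency at (0, 0) is zero.
    left_counts = np.array([0, 12])
    right_counts = np.array([0, 37])
    chi2_contingency(np.array([left_counts, right_counts]))
    # ValueError: The internally computed table of expected frequencies
    # has a zero element at (0, 0).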
I don't know what your use case is, but let's adapt the example from the documentation of chi2_contingency to your data. The test cannot be expected to return a meaningful result if none of the participants were given aspirin.
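Roughly like this (a minimal sketch; the counts are borrowed from the aspirin-study example in the chi2_contingency documentation, and the zeroed row stands in for "nobody got aspirin"):

    import numpy as np
    from scipy.stats import chi2_contingency

    # Rows: (aspirin group, placebo group); columns: (heart attack, no heart attack).
    observed = np.array([[104, 10933],
                         [189, 10845]])
    chi2, p, _, _ = chi2_contingency(observed)  # fine: both groups are populated

    # If none of the participants were given aspirin, the first row is all
    # zeros, every expected frequency in it is zero, and scipy raises the
    # same ValueError you are seeing:
    no_aspirin = np.array([[0, 0],
                           [189, 10845]])
    chi2_contingency(no_aspirin)  # ValueError: ... zero element at (0, 0)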
You'll need to consider what this means in the context of your problem and reassess whether you want to provide that input. Instead of an error, the only correct result that the function could provide would have a statistic of np.nan. Technically, the p-value would also be np.nan, but practically speaking, you could consider it to be 1.0 if you prefer (regardless of your significance threshold, it would not be interpreted as evidence against the null hypothesis).
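In your tree, that degenerate table appears whenever a class is absent from the node being split: np.bincount(..., minlength=len(classes)) then produces a column of zeros on both sides. Instead of adding a small constant, you can drop the empty classes before running the test and skip candidate splits where fewer than two classes remain. A minimal sketch (safe_chi2 is a name I made up, not an existing API):

    import numpy as np
    from scipy.stats import chi2_contingency

    def safe_chi2(left_counts, right_counts):
        # Build the 2 x n_classes table and drop classes that are absent
        # from this node; their columns sum to zero and make the expected
        # frequencies degenerate.
        table = np.array([left_counts, right_counts])
        table = table[:, table.sum(axis=0) > 0]
        if table.shape[1] < 2:
            return None  # only one class present: the test is not defined
        chi2, p, _, _ = chi2_contingency(table)
        return chi2

In best_split, replace the direct chi2_contingency call with chi2 = safe_chi2(left_counts, right_counts) and guard the comparison with if chi2 is not None and chi2 > best_chi2. Both rows of the table are always non-empty (i runs from 1 to m - 1), so once the empty columns are removed every expected frequency is strictly positive and the ValueError cannot occur.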