Data is normally distributed, but the KS test returns a statistic of 1.0

62 views Asked by At

I have an age variable. When I plotted it using a KDE plot and a Q-Q plot, the distribution seemed normal; however, when I performed the KS test, the test statistic was 1.0 and the p-value was 0.0.

Can someone please help me explain this observation? I used the KS test on other variables, and the results were consistent with the visualizations for those.

# library
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sps                         

# The age variable: integer ages collected into a 1-D NumPy array.
# This is the sample that is plotted and tested for normality below.
age = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
       66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
       88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
       64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
       69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
       73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
       69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
       73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
       78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
       86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
       92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
       70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
       77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
       70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
       78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
       88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
       67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
       68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
       83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
       78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
       60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
       75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])

# Visualization: KDE of the ages (left panel) next to a normal Q-Q plot
# (right panel), drawn on one 8 x 4 inch figure.
fig, ax = plt.subplots(1, 2, figsize=(8, 4))         # (rows, cols) of plots
sns.kdeplot(age, color='red', alpha=.1, fill=True,
            ax=ax[0])                                # distribution plot
# Q-Q plot. The snippet imports scipy.stats as `sps` but never imports the
# `sm` module the previous `sm.qqplot(...)` call relied on, so that call
# raised NameError. `sps.probplot` draws the ordered ages against normal
# theoretical quantiles together with a least-squares reference line.
sps.probplot(age, dist='norm', plot=ax[1])
fig.tight_layout()                                   # avoid overlapping labels
plt.show()                                           # show plots

# KS test (chosen because n > 50)
sample_size = age.size
print('n =', sample_size)
# Only the distribution name is given here, so the sample is compared with
# the standard normal distribution (mean 0, standard deviation 1), not with
# a normal distribution fitted to `age`.
sps.kstest(age, 'norm')

Output

1

There is 1 answer

1
Robert Long On BEST ANSWER

@Timur Shtatland is correct. Your code is:

sps.kstest(age, 'norm')

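Without specifying the parameters of the normal distribution, you are comparing your data to a standard normal distribution (with mean 0 and standard deviation 1). Every age in your sample is far above 0, where the standard normal CDF is essentially 1, so the largest gap between the empirical CDF and the reference CDF is 1.0. It is therefore not surprising that the p-value for the test is effectively zero. Instead, you should use the mean and standard deviation of your data: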

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import stats

# Data: the age sample from the question, as a 1-D NumPy array of integers.
# NOTE(review): appears to be a verbatim copy of the question's `age` array — confirm.
data = np.array([87, 88, 75, 76, 80, 88, 90, 80, 83, 85, 71, 73, 75, 93, 95, 68, 69,
        66, 68, 78, 80, 83, 81, 82, 85, 76, 77, 88, 90, 80, 81, 85, 86, 87,
        88, 92, 80, 82, 84, 72, 76, 61, 64, 86, 87, 82, 84, 69, 71, 73, 74,
        64, 66, 77, 80, 60, 62, 86, 88, 91, 90, 92, 79, 80, 82, 84, 88, 89,
        69, 70, 73, 75, 82, 85, 88, 89, 81, 83, 84, 86, 88, 71, 73, 75, 70,
        73, 72, 73, 68, 69, 71, 75, 77, 83, 85, 77, 78, 66, 66, 68, 68, 69,
        69, 70, 71, 71, 72, 92, 94, 97, 74, 78, 82, 84, 85, 87, 65, 67, 71,
        73, 81, 83, 85, 78, 79, 80, 75, 78, 68, 70, 72, 79, 81, 83, 80, 81,
        78, 81, 82, 61, 62, 67, 68, 71, 73, 88, 90, 81, 82, 80, 82, 84, 85,
        86, 83, 84, 70, 72, 75, 76, 77, 73, 75, 66, 69, 71, 69, 73, 89, 91,
        92, 69, 71, 73, 66, 68, 69, 82, 84, 78, 80, 63, 65, 96, 98, 78, 80,
        70, 72, 73, 75, 76, 75, 78, 83, 84, 61, 63, 71, 72, 74, 89, 91, 74,
        77, 66, 67, 80, 83, 77, 80, 82, 71, 74, 76, 82, 84, 86, 69, 74, 75,
        70, 71, 86, 87, 70, 72, 77, 79, 81, 83, 62, 65, 76, 78, 73, 75, 76,
        78, 73, 75, 73, 74, 76, 78, 67, 71, 81, 83, 85, 76, 78, 73, 74, 86,
        88, 70, 71, 74, 75, 77, 79, 81, 81, 84, 86, 76, 79, 78, 80, 82, 65,
        67, 78, 81, 70, 71, 74, 78, 74, 75, 73, 75, 67, 68, 76, 78, 81, 65,
        68, 69, 71, 89, 91, 93, 77, 79, 68, 73, 80, 82, 77, 78, 80, 82, 81,
        83, 73, 75, 66, 68, 69, 75, 77, 78, 81, 73, 75, 73, 76, 73, 76, 76,
        78, 77, 79, 80, 82, 84, 77, 79, 78, 80, 71, 73, 76, 77, 81, 75, 79,
        60, 62, 64, 70, 72, 73, 84, 87, 89, 68, 70, 89, 90, 93, 79, 81, 74,
        75, 77, 73, 75, 66, 66, 68, 72, 72, 73, 80, 82, 86, 61, 63, 65])


# Fit a normal distribution to the data; `mu` and `std` are the estimated
# mean and standard deviation, reused by the KS test below.
mu, std = norm.fit(data)

# Shapiro-Wilk normality test; it needs no distribution parameters.
shapiro_test = stats.shapiro(data)

print("\nShapiro-Wilk Test:")
print(f"Statistic: {shapiro_test[0]:.2f}")
print(f"p-value: {shapiro_test[1]:.2f}")

# Perform the KS test for normality against N(mu, std) instead of the
# default standard normal; `args` supplies the reference distribution's
# location and scale.
# NOTE(review): mu and std are estimated from the same sample being tested,
# so this p-value is only approximate (it tends to be conservative); a
# Lilliefors-type correction would be needed for an exact test — confirm
# whether that matters for your use case.
ks_statistic, p_value = stats.kstest(data, 'norm', args=(mu, std))

print("\nKolmogorov-Smirnov Test:")
print(f"Statistic: {ks_statistic:.2f}")
print(f"p-value: {p_value:.2f}")

which produces this:

Shapiro-Wilk Test:
Statistic: 0.99
p-value: 0.07

Kolmogorov-Smirnov Test:
Statistic: 0.05
p-value: 0.21

I would also plot a histogram of the data and then overlay a normal density with the parameters from your data:

# Create a density-normalised histogram of the data (20 bins) so that it
# shares a y-scale with the probability density curve drawn on top of it.
plt.hist(data, 20, density=True, alpha=0.5, color='gray')

# Plot the PDF of the fitted normal distribution across the x-range that
# the histogram established.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)

plt.plot(x, p, 'k', linewidth=2)
title = f"Fit results: mu = {mu:.2f},  std = {std:.2f}"
plt.title(title)

# The plotted variable is age, so label the axis accordingly.
plt.xlabel('Age')
plt.ylabel('Density')
plt.show()

enter image description here