Getting very high probability density values when fitting a GMM using sklearn


I am trying to fit a GMM to my 2D data. The values are very small (around 0.002342), and while fitting the GMM I am getting some very high probability density values. Below is the code for my GMM; please help me rectify the error:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

# Assuming all_clouds_final_output_arrays is already defined
cloud_1_data = all_clouds_final_output_arrays['cloud_1'].reshape(-1, 2)
cloud_2_data = all_clouds_final_output_arrays['cloud_2'].reshape(-1, 2)

# Fit GMM to each cloud's data
gmm_cloud_1 = GaussianMixture(n_components=2, random_state=0, reg_covar=1e-07)  # Adjust reg_covar as needed
gmm_cloud_1.fit(cloud_1_data)

gmm_cloud_2 = GaussianMixture(n_components=2, random_state=0, reg_covar=1e-07)  # Adjust reg_covar as needed
gmm_cloud_2.fit(cloud_2_data)


# Compute BIC and AIC for each model
bic_cloud_1 = gmm_cloud_1.bic(cloud_1_data)
aic_cloud_1 = gmm_cloud_1.aic(cloud_1_data)
bic_cloud_2 = gmm_cloud_2.bic(cloud_2_data)
aic_cloud_2 = gmm_cloud_2.aic(cloud_2_data)

# Print BIC and AIC values
print(f'BIC for Cloud 1: {bic_cloud_1}')
print(f'AIC for Cloud 1: {aic_cloud_1}')
print(f'BIC for Cloud 2: {bic_cloud_2}')
print(f'AIC for Cloud 2: {aic_cloud_2}')

# Determine a common range for both datasets
x_min = min(cloud_1_data[:, 0].min(), cloud_2_data[:, 0].min()) - 0.01
x_max = max(cloud_1_data[:, 0].max(), cloud_2_data[:, 0].max()) + 0.01
y_min = min(cloud_1_data[:, 1].min(), cloud_2_data[:, 1].min()) - 0.01
y_max = max(cloud_1_data[:, 1].max(), cloud_2_data[:, 1].max()) + 0.01

# Initial grid resolution
x_res, y_res = 100, 100

# Tolerance for PDF sum
tolerance = 1e-5
max_iterations = 100  # Limit to prevent infinite loops

for _ in range(max_iterations):
    x, y = np.linspace(x_min, x_max, x_res), np.linspace(y_min, y_max, y_res)
    X, Y = np.meshgrid(x, y)
    XY = np.column_stack([X.ravel(), Y.ravel()])

    # Calculate area represented by each grid point
    dx, dy = x[1] - x[0], y[1] - y[0]
    area_per_point = dx * dy

    # Calculate PDF values (score_samples returns log-density, so
    # exponentiating gives density values, which can exceed 1)
    pdf_cloud_1 = np.exp(gmm_cloud_1.score_samples(XY)).reshape(X.shape)
    pdf_cloud_2 = np.exp(gmm_cloud_2.score_samples(XY)).reshape(X.shape)

    
    # Find maximum probability density values
    max_pdf_cloud_1 = np.max(pdf_cloud_1)
    max_pdf_cloud_2 = np.max(pdf_cloud_2)

    # Check if sums are close to 1
    sum_pdf_cloud_1 = np.sum(pdf_cloud_1) * area_per_point
    sum_pdf_cloud_2 = np.sum(pdf_cloud_2) * area_per_point
    if abs(sum_pdf_cloud_1 - 1) < tolerance and abs(sum_pdf_cloud_2 - 1) < tolerance:
        break

    # Increase resolution
    x_res *= 2
    y_res *= 2
  

# Plotting
# Plot PDF for cloud_1
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.contourf(X, Y, pdf_cloud_1, levels=50, cmap='viridis')
plt.colorbar()
plt.title('PDF for Cloud 1')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')

# Plot PDF for cloud_2
plt.subplot(1, 2, 2)
plt.contourf(X, Y, pdf_cloud_2, levels=50, cmap='viridis')
plt.colorbar()
plt.title('PDF for Cloud 2')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')

plt.show()
# After plotting PDFs, also plot the actual data points for comparison
# Plotting data points for cloud_1 in a separate figure
plt.figure()  # Create a new figure for cloud_1 data points
plt.scatter(cloud_1_data[:, 0], cloud_1_data[:, 1], c='white', edgecolor='black', label='Cloud 1 Data Points')
plt.title('Data Points for Cloud 1')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()

# Plotting data points for cloud_2 in a separate figure
plt.figure()  # Create a new figure for cloud_2 data points
plt.scatter(cloud_2_data[:, 0], cloud_2_data[:, 1], c='white', edgecolor='black', label='Cloud 2 Data Points')
plt.title('Data Points for Cloud 2')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()

# Calculate and print the sum of PDF values over the grid for each cloud
sum_pdf_cloud_1 = np.sum(pdf_cloud_1) * area_per_point
sum_pdf_cloud_2 = np.sum(pdf_cloud_2) * area_per_point

# Print sums
print(f'Sum of PDF for Cloud 1: {sum_pdf_cloud_1}')
print(f'Sum of PDF for Cloud 2: {sum_pdf_cloud_2}')
print(cloud_1_data)

These are some of the values I get:

BIC for Cloud 1: -258.1861653468744
AIC for Cloud 1: -261.51460136980893
BIC for Cloud 2: -258.74139165569443
AIC for Cloud 2: -262.06982767862894
Sum of PDF for Cloud 1: 1.0
Sum of PDF for Cloud 2: 1.0

I then check whether any density values exceed 1:

# Check if any probability density values exceed 1 for Cloud 1
max_pdf_value_cloud_1 = np.max(pdf_cloud_1)
print(f'Maximum PDF value for Cloud 1: {max_pdf_value_cloud_1}')
if max_pdf_value_cloud_1 > 1:
    print("Some probability density values are greater than 1 in Cloud 1")
else:
    print("No probability density values exceed 1 in Cloud 1")

# Check if any probability density values exceed 1 for Cloud 2
max_pdf_value_cloud_2 = np.max(pdf_cloud_2)
print(f'Maximum PDF value for Cloud 2: {max_pdf_value_cloud_2}')
if max_pdf_value_cloud_2 > 1:
    print("Some probability density values are greater than 1 in Cloud 2")
else:
    print("No probability density values exceed 1 in Cloud 2")

The output is:

Maximum PDF value for Cloud 1: 1394614.626800622
Some probability density values are greater than 1 in Cloud 1
Maximum PDF value for Cloud 2: 1459491.4259817367
Some probability density values are greater than 1 in Cloud 2

In the above code I am using sklearn's GaussianMixture and refining the grid until the PDF sums over the grid are close to 1, which works, but my BIC and AIC values are negative and some probability density values are extremely high. I think this is because my dataset has very small values, so the fitted variances are small, and that is why this is happening.
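To sanity-check that intuition, here is a minimal sketch (using scipy.stats.norm on a synthetic value, not my actual clouds) showing that a Gaussian with a tiny standard deviation has pointwise density values far above 1 while still integrating to 1:

from scipy.stats import norm
from scipy.integrate import quad

# A 1D Gaussian with a tiny standard deviation, comparable in scale
# to my data (values around 0.002)
sigma = 1e-4
g = norm(loc=0.002, scale=sigma)

# Peak density is 1 / (sigma * sqrt(2 * pi)) ~ 3989, far above 1
print('max density:', g.pdf(0.002))

# Yet the density still integrates to 1 over its support
total, _ = quad(g.pdf, 0.002 - 10 * sigma, 0.002 + 10 * sigma)
print('integral:', total)

If that is what is going on here, the large values may not be an error at all, but I would like to confirm whether this behaviour is expected or whether something in my fitting code is wrong.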
