Solving question 8M9 from Bayesian Modeling and Computation in Python

24 views Asked by At

In the GitHub repository you will find a data set of the distribution of citations of scientific papers. Use SMC-ABC to fit a g-and-k distribution to this dataset. Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a suitable fit.

this i the code i already wrote :

%matplotlib inline
import arviz as az
import matplotlib.pyplot as plt
import pymc as pm
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from scipy import optimize

#reading data
data = pd.read_csv("sc.csv")
values = data[data.columns[0]].values

#define the G-and-K quantile class
class g_and_k_quantile:
    def __init__(self):
        self.quantile_normal = stats.norm(0, 1).ppf

    def ppf(self, x, a, b, g, k):
        z = self.quantile_normal(x)
        return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z

#step 2: estimate G-and-K parameters
def objective(params, data):
    a, b, g, k = params
    gk = g_and_k_quantile()
    quantiles = np.linspace(0.01, 0.99, len(data))  #avoiding 0 and 1
    theoretical = gk.ppf(quantiles, a, b, g, k)
    empirical = np.quantile(data, quantiles)
    return np.sum((theoretical - empirical) ** 2)

initial_params = [np.median(values), np.std(values), 0.1, 0.1]
result = optimize.minimize(objective, initial_params, args=(values,), method='Nelder-Mead')
a, b, g, k = result.x

#step 3
gk = g_and_k_quantile()
simulated_data = gk.ppf(np.random.uniform(0.01, 0.99, 1000), a, b, g, k)

# Plotting the original data
plt.figure(figsize=(10, 5))
plt.hist(values, bins=50, alpha=0.6, color='blue', label='Actual Data')
plt.xlabel('Value')
plt.ylabel('Count')
plt.title('Histogram of Actual Data')
plt.legend()
plt.show()

# Plotting the G-and-K distribution data
plt.figure(figsize=(10, 5))
plt.hist(simulated_data, bins=50, density=True, alpha=0.6, color='red', label='Simulated G-and-K')
plt.xlabel('Simulated Value')
plt.ylabel('Density')
plt.title('Histogram of Simulated G-and-K Distribution')
plt.legend()
plt.show()

import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
from scipy.stats import norm

# Make sure to import pandas for reading the CSV
data = pd.read_csv("sc.csv")
observed_data = data.iloc[:, 0].values
bsas_co = observed_data  # Make sure bsas_co is the numpy array of observed values

class g_and_k_quantile:
    def __init__(self):
        self.quantile_normal = norm.ppf

    def ppf(self, x, a, b, g, k):
        z = self.quantile_normal(x)
        return a + b * (1 + 0.8 * np.tanh(g * z / 2)) * ((1 + z**2)**k) * z

# Updated to accept rng and size
def gk_simulator(a, b, g, k):
    return gk.rvs(len(bsas_co), a, b, g, k)

def octo_summary(data):
    quantiles = np.quantile(data, [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875])
    sa = quantiles[3]  # Median
    sb = quantiles[5] - quantiles[1]  # IQR
    sg = (quantiles[5] + quantiles[1] - 2 * sa) / sb  # Scaled difference
    sk = (quantiles[6] - quantiles[4] + quantiles[2] - quantiles[0]) / sb  # Scaled range
    return np.array([sa, sb, sg, sk])

with pm.Model() as model:
    # Define parameters
    a = pm.Normal('a', mu=0, sigma=10)
    b = pm.HalfNormal('b', sigma=5)
    g = pm.Normal('g', mu=0, sigma=1)
    k = pm.HalfNormal('k', sigma=1)

    # Simulator
    observed = pm.Simulator('observed', gk_simulator, params=[a, b, g, k], 
                            epsilon=0.1, observed=observed_data)

    # Sample using SMC
    trace = pm.sample_smc(draws=1000)

az.plot_trace(trace_gk)
plt.show()



first of all the second part of the question "Perform all the necessary steps to find asuitable value for “epsilon” and ensuring the model converge and results provides a suitable fit." is not working, and i dont know if my solution is the right solution for both parts .. i will be happy if someone could help find the errors in my code, or if there is any solution of this book code so let me know

thanks in advance

0

There are 0 answers