How to transform U to V in Copula

60 views Asked by At

I have two set of variables, one is a function of the other one. I am trying to first fit a copula to them using "copulas package" and then perform extreme value analysis for the dependent variable. Finally, I want to obtain the concurrent independent variable using the dependent variable and theta (I am using archimedean copula). For the last part I am having trouble and don't know how to solve it. I already used ChatGPT and got some ideas but they are not efficient. I would appreciate if someone could give me ideas or help me.

Thanks

def check_and_coupla_fit(df, ice_dist, sped_dist):
    """
    check the copula and return the 
    """

    ice = ice_dist.cdf(df.jones)
    sped = sped_dist.cdf(df.sped)


    tau_values = []
    rho_values = []
    theta_values = []
    for copula_type in ['clayton', 'frank', 'gumbel']:
        copula = getattr(copulas, copula_type.capitalize() + 'Copula')
        copula_instance = copula()
        copula_instance.fit([ice, sped])
        
        # Compute tau and rho
        u, v = copula_instance.sample(1000)
        tau, rho = compute_tau_and_rho(u, v)
        
        tau_values.append(tau)
        rho_values.append(rho)
        theta_values.append(copula_instance.compute_theta())
        
    # Determine the best copula based on tau and rho
    best_idx = np.argmax(np.array(tau_values) + np.array(rho_values))
    
    # Create the result array
    result = np.full((3, 1), np.nan)
    result[best_idx] = theta_values[best_idx]
    
    return result

def compute_tau_and_rho(u, v):
    """
    Compute Kendall's tau and Spearman's correlation coefficient.
    """
    tau, _ = copulas.stats.kendall_tau(u, v)
    rho, _ = spearmanr(u, v)
    return tau, rho

def get_stat(df):
    """
    gets distribution parameters for both ice accretion and wind speed.
    """
    
    # Extract the year from the 'valid' column
    df['year'] = df['valid'].dt.year

    # Group by year and get the maximum annual ice accretion values from the dataset
    max_ice_values = df.groupby(['year'])['jones'].max().reset_index()

    df_max_with_sped = pd.merge(max_ice_values, df,
                                on = ['jones', 'year'],
                                how = 'inner')[['jones', 'sped']]
    
    c_sped,loc_sped, scale_sped = weibull_min.fit(df_max_with_sped.sped)
    c_ice, loc_ice, scale_ice = genextreme.fit(df_max_with_sped.jones)
    
    return c_ice, loc_ice, scale_ice, c_sped,loc_sped, scale_sped
def inv_gumbel(u, theta):
    """
    Inverse of the Gumbel copula for given u and theta.
    """
    return np.exp(-(-np.log(u))**(1/theta))


def inv_frank(u, theta):
    """
    Inverse of the Frank copula for given u and theta.
    """
    term = np.log(1 + (np.exp(-theta*u) - 1) * (np.exp(-theta) - 1) / (np.exp(-theta*u) - np.exp(-theta) - 1))
    return -1/theta * np.log(1 - term)


def inv_clayton(u, theta):
    """
    Inverse of the Clayton copula for given u and theta.
    """
    return (u**(-theta) + 1)**(-1/theta)
def get_concurrent_wind(arr, U):
    """
    transforms U to V
    arr: vector of thetas
    U: is standardized ice value
    """
    V = 0
    if np.isnan(arr[0]):
        # Clayton copula
        V = inv_clayton(U, arr[0])
        
    elif np.isnan(arr[1]):
        # Frank copula
        V = inv_frank(U, arr[1])
        
    elif np.isnan(arr[2]):
        # Gumbel copula
        V = inv_gumbel(U, arr[2])
        
    else:
        raise ValueError('Size of arr should be 3, not more!');

    return V
stations = df.station.unique()
# unique stations

Results = np.array([])
counter_rows = 0
proceeed_statistical_analysis = True
for counter_rows, station in bar(enumerate(stations), total=203):
    
    station_data = df.loc[df.station == station].copy()
    # EVA for each station
    station_data = station_data[['valid','jones']]
    # only accounts for conditions in which EVA can be implemented! 
    try:
        station_data.set_index('valid',
                               inplace = True)
        
        station_data = station_data.squeeze()
        
        model = EVA(station_data)
        
        model.get_extremes(
            method="BM",
            extremes_type="high",
            block_size="365.2425D",
            errors="ignore",
        )

        # Get max values for different return periods (50 and 500 yrs).
        model.fit_model()
        summary = model.get_summary(
        return_period=[50, 500],
        alpha=0.95,
        n_samples=1_000)
        summary = summary.iloc[:, 0]
        
        jones_50 = summary[50]
        jones_500 = summary[500]    

        
                                       
    except:
        jones_50 = jones_500 = sped_50 = sped_500 = np.NaN
        arr = np.zeros([1, 3])
        proceeed_statistical_analysis = False

    if proceeed_statistical_analysis:
        # Obtaining concurrent wind speed for 50 and 500 years return periods.
        station_data = df.loc[df.station == station].copy()
        # EVA for each station
        station_data = station_data[['valid','jones']]
        
        c_ice, loc_ice, scale_ice, c_sped,loc_sped, scale_sped = get_stat(data)
    
        ice_dist = genextreme(c = c_ice,
                              loc = loc_ice,
                              scale = scale_ice)
    
        sped_dist = weibull_min(c = c_sped,
                                loc = loc_sped,
                                scale = scale_sped)
        
        # acquire theta and the corresponding copula function for the observations in the ith station
        arr = check_and_coupla_fit(df = station_data,
                                   ice_dist = ice_dist,
                                   sped_dist = sped_dist)
    
        # computes concurrent 50 years wind speed
        sped_50 = get_concurrent_wind(arr, sped_dist.cdf(jones_50))
    
        # computes concurrent 500 years wind speed
        sped_500 = get_concurrent_wind(arr, sped_dist.cdf(jones_500))

    Temp1 = df.loc[df.station == station].copy()
    lon, lat,elev = Temp1.iloc[0,2], Temp1.iloc[0,3], Temp1.iloc[0,4]

    Temp = np.zeros(shape = [1, 10])
    Temp[0, 0] = lon
    Temp[0, 1] = lat
    Temp[0, 2] = elev
    Temp[0, 3] = jones_50
    Temp[0, 4] = jones_500
    Temp[0, 5] = sped_50
    Temp[0, 6] = sped_500
    Temp[0, 7:] = arr
    
    if counter_rows == 0:
        Results = Temp.copy()
    elif counter_rows >0:
        
        Results = np.concatenate([Results, Temp],
                                axis=0)
    else:
        raise ValueError('Something happened!');

Results = pd.DataFrame(Results,
                      columns=['lon',
                               'lat',
                               'elev',
                               'jones_50',
                               'jones_500',
                               'sped_50',
                               'sped_500',
                               'Clayton',
                               'Frank',
                               'Gumbel'])

Results = Results.loc[~np.isnan(Results.jones_50)]

Update 8-01 I have tried different approaches and finally came to understand that finding V based on only theta and U value is not possible. However, if we can generate sufficient number of samples from the fitted copula function (e.g., Clayton), it is most likely that we get close enough to the set of desired set of values. If define this as an optimization problem, we can find some reasonable results after some iterations that could take some time, depending on the computer memeory.

0

There are 0 answers