Module `stikpetP.correlations.cor_goodman_kruskal_gamma`

Expand source code

import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross

def r_goodman_kruskal_gamma(ordField1, ordField2, levels1=None, levels2=None, ase="appr", useRanks=False):
    '''
    Goodman-Kruskal Gamma
    ---------------------
    A rank correlation coefficient. It ranges from -1 (perfect negative association) to 1 (perfect positive association). A zero would indicate no correlation at all.
    
    A positive correlation indicates that if someone scored high on the first field, they also likely score high on the second, while a negative correlation would indicate a high score on the first would give a low score on the second.
    
    Alternatives for Gamma are Kendall Tau, Stuart-Kendall Tau and Somers D, but also Spearman rho could be considered.
    
    Gamma looks at so-called discordant and concordant pairs, and ignores tied pairs. Kendall Tau b does the same, but applies a correction for ties. Stuart-Kendall Tau c also, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.
    
    Parameters
    ----------
    ordField1 : pandas series
        the ordinal or scale scores of the first variable
    ordField2 : pandas series
        the ordinal or scale scores of the second variable
    levels1 : list or dictionary, optional
        the categories to use from ordField1
    levels2 : list or dictionary, optional
        the categories to use from ordField2
    ase : {"appr", 0, 1} : optional
        which asymptotic standard error to use. Default is "appr". Any value other than "appr" or 0 selects ASE1.
    useRanks : boolean, optional
        if True the position of each row/column in the cross table is used as its score. If False (the default) the row/column labels themselves are compared, unless the corresponding levels argument is given, in which case positions are used for that field as well.
        
    Returns
    -------
    A dataframe with:
    
    * *gamma*, the gamma value
    * *statistic*, the test statistic (z-value)
    * *p-value*, the p-value (significance)
    
    If there are no concordant and no discordant pairs (P + Q = 0) gamma is undefined and all three values are NaN. If the chosen standard error is zero (e.g. a perfect association) the statistic is plus or minus infinity and the p-value 0.
    
    Notes
    -----
    The formula used (Goodman & Kruskal, 1954, p. 749):
    $$\\gamma = \\frac{P-Q}{P+Q}$$
    
    With:
    $$P = \\sum_{i,j} P_{i,j}$$
    $$Q = \\sum_{i,j} Q_{i,j}$$
    $$P_{i,j} = F_{i,j}\\times C_{i,j}$$
    $$Q_{i,j} = F_{i,j}\\times D_{i,j}$$
    $$C_{i,j} = \\sum_{h<i}\\sum_{k<j} F_{h,k} + \\sum_{h>i}\\sum_{k>j} F_{h,k}$$
    $$D_{i,j} = \\sum_{h<i}\\sum_{k>j} F_{h,k} + \\sum_{h>i}\\sum_{k<j} F_{h,k}$$
    
    The test can be done with a generic approximation:
    $$z_{\\gamma} = \\gamma\\times\\sqrt{\\frac{P+Q}{n\\times\\left(1-\\gamma^2\\right)}}$$
    
    If we assume the alternative hypothesis we can obtain (Goodman & Kruskal, 1963, p. 324; Goodman & Kruskal, 1972, p. 416; Brown & Benedetti, 1977, p. 310):
    $$z_{\\gamma} = \\frac{\\gamma}{ASE_1}$$
    $$ASE_1 = \\frac{4}{\\left(P+Q\\right)^2}\\times\\sqrt{\\sum_{i=1}^r\\sum_{j=1}^c F_{i,j}\\times\\left(Q\\times C_{i,j}-P\\times D_{i,j}\\right)^2}$$
    
    While if we assume the null hypothesis we can obtain (Brown & Benedetti, 1977, p. 311):
    $$z_{\\gamma} = \\frac{\\gamma}{ASE_0}$$
    $$ASE_0 = \\frac{2}{P+Q}\\times\\sqrt{\\sum_{i=1}^r\\sum_{j=1}^c F_{i,j}\\times\\left(C_{i,j}- D_{i,j}\\right)^2 - \\frac{\\left(P-Q\\right)^2}{n}}$$
    
    The significance (p-value) in each case is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_{\\gamma}\\right|\\right)\\right)$$
    
    *Symbols Used*
    
    * \\(F_{i,j}\\), the number of cases in row i, column j.
    * \\(n\\), the total sample size
    * \\(r\\), the number of rows
    * \\(c\\), the number of columns
    * \\(\\Phi\\left(\\dots\\right)\\), the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Brown, M. B., & Benedetti, J. K. (1977). Sampling behavior of test for correlation in two-way contingency tables. *Journal of the American Statistical Association, 72*(358), 309–315. doi:10.2307/2286793
    
    Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. *Advances in Methodology and Statistics, 8*(1). doi:10.51936/milh5641
    
    Goodman, L. A., & Kruskal, W. H. (1963). Measures of association for cross classifications III: Approximate sampling theory. *Journal of the American Statistical Association, 58*(302), 310–364. doi:10.1080/01621459.1963.10500850
    
    Goodman, L. A., & Kruskal, W. H. (1972). Measures of association for cross classifications IV: Simplification of asymptotic variances. *Journal of the American Statistical Association, 67*(338), 415–421. doi:10.1080/01621459.1972.10482401
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    '''

    def _ratio(num, den):
        # Division that yields +/-inf or nan for a zero denominator instead
        # of raising ZeroDivisionError, so degenerate tables still produce a
        # result row.
        if den != 0:
            return num / den
        if num > 0:
            return float("inf")
        if num < 0:
            return float("-inf")
        return float("nan")

    def _direction(a, b):
        # +1 if a ranks above b, -1 if below, 0 if tied.
        return int(a > b) - int(a < b)

    # NOTE(review): tab_cross is expected to return a two-dimensional pandas
    # DataFrame of cell counts - confirm against its implementation.
    ct = tab_cross(ordField1, ordField2, order1=levels1, order2=levels2)
    k1, k2 = ct.shape

    # Scores that decide the ordering of rows and columns. Positions are used
    # when ranks are requested or an explicit level order was supplied,
    # otherwise the labels of the cross table are compared directly.
    if useRanks or levels1 is not None:
        rowScores = list(range(k1))
    else:
        rowScores = ct.index.tolist()

    if useRanks or levels2 is not None:
        colScores = list(range(k2))
    else:
        colScores = ct.columns.tolist()

    # Work on plain Python numbers: much faster than per-cell DataFrame
    # access, and Python integers cannot overflow in the large intermediate
    # products of the ASE formulas.
    counts = ct.values.tolist()

    # Ordering of every pair of rows / columns, computed once.
    rowDir = [[_direction(rowScores[h], rowScores[i]) for h in range(k1)] for i in range(k1)]
    colDir = [[_direction(colScores[k], colScores[j]) for k in range(k2)] for j in range(k2)]

    # conc[i][j] (C_ij) and disc[i][j] (D_ij) must have the same r x c shape
    # as the cross table, also when the table is not square.
    conc = [[0] * k2 for _ in range(k1)]
    disc = [[0] * k2 for _ in range(k1)]

    for i in range(k1):
        for h in range(k1):
            rDir = rowDir[i][h]
            if rDir == 0:
                # same row score: tied on the first variable, pair is ignored
                continue
            for j in range(k2):
                for k in range(k2):
                    cDir = colDir[j][k]
                    if cDir == 0:
                        # tied on the second variable
                        continue
                    if rDir == cDir:
                        conc[i][j] += counts[h][k]
                    else:
                        disc[i][j] += counts[h][k]

    n = 0
    p = 0
    q = 0
    for i in range(k1):
        for j in range(k2):
            f = counts[i][j]
            n += f
            p += f * conc[i][j]
            q += f * disc[i][j]

    if p + q == 0:
        # No concordant or discordant pairs at all: gamma is undefined.
        g = z = pValue = float("nan")
    else:
        g = (p - q) / (p + q)

        if ase == "appr":
            z = g * _ratio(p + q, n * (1 - g**2))**0.5
        elif ase == 0:
            radicand = sum(counts[i][j] * (conc[i][j] - disc[i][j])**2
                           for i in range(k1) for j in range(k2)) - (p - q)**2 / n
            # Guard against a tiny negative value caused by rounding, which
            # would otherwise turn the square root into a complex number.
            ase0 = 2 * max(radicand, 0.0)**0.5 / (p + q)
            z = _ratio(g, ase0)
        else:
            sumSq = sum(counts[i][j] * (q * conc[i][j] - p * disc[i][j])**2
                        for i in range(k1) for j in range(k2))
            ase1 = 4 * sumSq**0.5 / ((p + q)**2)
            z = _ratio(g, ase1)

        pValue = 2 * (1 - NormalDist().cdf(abs(z)))

    res = pd.DataFrame([[g, z, pValue]], columns=["gamma", "statistic", "p-value"])

    return res

Functions

def r_goodman_kruskal_gamma(ordField1, ordField2, levels1=None, levels2=None, ase='appr', useRanks=False)

Goodman-Kruskal Gamma

A rank correlation coefficient. It ranges from -1 (perfect negative association) to 1 (perfect positive association). A zero would indicate no correlation at all.

A positive correlation indicates that if someone scored high on the first field, they also likely score high on the second, while a negative correlation would indicate a high score on the first would give a low score on the second.

Alternatives for Gamma are Kendall Tau, Stuart-Kendall Tau and Somers D, but also Spearman rho could be considered.

Gamma looks at so-called discordant and concordant pairs, and ignores tied pairs. Kendall Tau b does the same, but applies a correction for ties. Stuart-Kendall Tau c also, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.

Parameters

ordField1 : pandas series: the ordinal or scale scores of the first variable
ordField2 : pandas series: the ordinal or scale scores of the second variable
levels1 : list or dictionary, optional: the categories to use from ordField1
levels2 : list or dictionary, optional: the categories to use from ordField2
ase : {"appr", 0, 1} : optional: which asymptotic standard error to use. Default is "appr"

Returns

A dataframe with:

gamma, the gamma value
statistic, the test statistic (z-value)
p-value, the p-value (significance)

Notes

The formula used (Goodman & Kruskal, 1954, p. 749): $\gamma = \frac{P-Q}{P+Q}$

With: $P = \sum_{i,j} P_{i,j}$ $Q = \sum_{i,j} Q_{i,j}$ $P_{i,j} = F_{i,j}\times C_{i,j}$ $Q_{i,j} = F_{i,j}\times D_{i,j}$ $C_{i,j} = \sum_{h<i}\sum_{k<j} F_{h,k} + \sum_{h>i}\sum_{k>j} F_{h,k}$ $D_{i,j} = \sum_{h<i}\sum_{k>j} F_{h,k} + \sum_{h>i}\sum_{k<j} F_{h,k}$

The test can be done with a generic approximation: $z_{\gamma} = \gamma\times\sqrt{\frac{P+Q}{n\times\left(1-\gamma^2\right)}}$

If we assume the alternative hypothesis we can obtain (Goodman & Kruskal, 1963, p. 324; Goodman & Kruskal, 1972, p. 416; Brown & Benedetti, 1977, p. 310): $z_{\gamma} = \frac{\gamma}{ASE_1}$ $ASE_1 = \frac{4}{\left(P+Q\right)^2}\times\sqrt{\sum_{i=1}^r\sum_{j=1}^c F_{i,j}\times\left(Q\times C_{i,j}-P\times D_{i,j}\right)^2}$

While if we assume the null hypothesis we can obtain (Brown & Benedetti, 1977, p. 311): $z_{\gamma} = \frac{\gamma}{ASE_0}$ $ASE_0 = \frac{2}{P+Q}\times\sqrt{\sum_{i=1}^r\sum_{j=1}^c F_{i,j}\times\left(C_{i,j}- D_{i,j}\right)^2 - \frac{\left(P-Q\right)^2}{n}}$

The significance (p-value) in each case is then determined using: $sig. = 2\times\left(1 - \Phi\left(\left|z_{\gamma}\right|\right)\right)$

Symbols Used

$F_{i,j}$ , the number of cases in row i, column j.
$n$ , the total sample size
$r$ , the number of rows
$c$ , the number of columns
$\Phi\left(\dots\right)$ , the cumulative distribution function of the standard normal distribution.

References

Brown, M. B., & Benedetti, J. K. (1977). Sampling behavior of test for correlation in two-way contingency tables. Journal of the American Statistical Association, 72(358), 309–315. doi:10.2307/2286793

Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. Advances in Methodology and Statistics, 8(1). doi:10.51936/milh5641

Goodman, L. A., & Kruskal, W. H. (1963). Measures of association for cross classifications III: Approximate sampling theory. Journal of the American Statistical Association, 58(302), 310–364. doi:10.1080/01621459.1963.10500850

Goodman, L. A., & Kruskal, W. H. (1972). Measures of association for cross classifications IV: Simplification of asymptotic variances. Journal of the American Statistical Association, 67(338), 415–421. doi:10.1080/01621459.1972.10482401

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def r_goodman_kruskal_gamma(ordField1, ordField2, levels1=None, levels2=None, ase="appr", useRanks=False):
    '''
    Goodman-Kruskal Gamma
    ---------------------
    A rank correlation coefficient. It ranges from -1 (perfect negative association) to 1 (perfect positive association). A zero would indicate no correlation at all.
    
    A positive correlation indicates that if someone scored high on the first field, they also likely score high on the second, while a negative correlation would indicate a high score on the first would give a low score on the second.
    
    Alternatives for Gamma are Kendall Tau, Stuart-Kendall Tau and Somers D, but also Spearman rho could be considered.
    
    Gamma looks at so-called discordant and concordant pairs, and ignores tied pairs. Kendall Tau b does the same, but applies a correction for ties. Stuart-Kendall Tau c also, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.
    
    Parameters
    ----------
    ordField1 : pandas series
        the ordinal or scale scores of the first variable
    ordField2 : pandas series
        the ordinal or scale scores of the second variable
    levels1 : list or dictionary, optional
        the categories to use from ordField1
    levels2 : list or dictionary, optional
        the categories to use from ordField2
    ase : {"appr", 0, 1} : optional
        which asymptotic standard error to use. Default is "appr". Any value other than "appr" or 0 selects ASE1.
    useRanks : boolean, optional
        if True the position of each row/column in the cross table is used as its score. If False (the default) the row/column labels themselves are compared, unless the corresponding levels argument is given, in which case positions are used for that field as well.
        
    Returns
    -------
    A dataframe with:
    
    * *gamma*, the gamma value
    * *statistic*, the test statistic (z-value)
    * *p-value*, the p-value (significance)
    
    If there are no concordant and no discordant pairs (P + Q = 0) gamma is undefined and all three values are NaN. If the chosen standard error is zero (e.g. a perfect association) the statistic is plus or minus infinity and the p-value 0.
    
    Notes
    -----
    The formula used (Goodman & Kruskal, 1954, p. 749):
    $$\\gamma = \\frac{P-Q}{P+Q}$$
    
    With:
    $$P = \\sum_{i,j} P_{i,j}$$
    $$Q = \\sum_{i,j} Q_{i,j}$$
    $$P_{i,j} = F_{i,j}\\times C_{i,j}$$
    $$Q_{i,j} = F_{i,j}\\times D_{i,j}$$
    $$C_{i,j} = \\sum_{h<i}\\sum_{k<j} F_{h,k} + \\sum_{h>i}\\sum_{k>j} F_{h,k}$$
    $$D_{i,j} = \\sum_{h<i}\\sum_{k>j} F_{h,k} + \\sum_{h>i}\\sum_{k<j} F_{h,k}$$
    
    The test can be done with a generic approximation:
    $$z_{\\gamma} = \\gamma\\times\\sqrt{\\frac{P+Q}{n\\times\\left(1-\\gamma^2\\right)}}$$
    
    If we assume the alternative hypothesis we can obtain (Goodman & Kruskal, 1963, p. 324; Goodman & Kruskal, 1972, p. 416; Brown & Benedetti, 1977, p. 310):
    $$z_{\\gamma} = \\frac{\\gamma}{ASE_1}$$
    $$ASE_1 = \\frac{4}{\\left(P+Q\\right)^2}\\times\\sqrt{\\sum_{i=1}^r\\sum_{j=1}^c F_{i,j}\\times\\left(Q\\times C_{i,j}-P\\times D_{i,j}\\right)^2}$$
    
    While if we assume the null hypothesis we can obtain (Brown & Benedetti, 1977, p. 311):
    $$z_{\\gamma} = \\frac{\\gamma}{ASE_0}$$
    $$ASE_0 = \\frac{2}{P+Q}\\times\\sqrt{\\sum_{i=1}^r\\sum_{j=1}^c F_{i,j}\\times\\left(C_{i,j}- D_{i,j}\\right)^2 - \\frac{\\left(P-Q\\right)^2}{n}}$$
    
    The significance (p-value) in each case is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_{\\gamma}\\right|\\right)\\right)$$
    
    *Symbols Used*
    
    * \\(F_{i,j}\\), the number of cases in row i, column j.
    * \\(n\\), the total sample size
    * \\(r\\), the number of rows
    * \\(c\\), the number of columns
    * \\(\\Phi\\left(\\dots\\right)\\), the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Brown, M. B., & Benedetti, J. K. (1977). Sampling behavior of test for correlation in two-way contingency tables. *Journal of the American Statistical Association, 72*(358), 309–315. doi:10.2307/2286793
    
    Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. *Advances in Methodology and Statistics, 8*(1). doi:10.51936/milh5641
    
    Goodman, L. A., & Kruskal, W. H. (1963). Measures of association for cross classifications III: Approximate sampling theory. *Journal of the American Statistical Association, 58*(302), 310–364. doi:10.1080/01621459.1963.10500850
    
    Goodman, L. A., & Kruskal, W. H. (1972). Measures of association for cross classifications IV: Simplification of asymptotic variances. *Journal of the American Statistical Association, 67*(338), 415–421. doi:10.1080/01621459.1972.10482401
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    '''

    def _ratio(num, den):
        # Division that yields +/-inf or nan for a zero denominator instead
        # of raising ZeroDivisionError, so degenerate tables still produce a
        # result row.
        if den != 0:
            return num / den
        if num > 0:
            return float("inf")
        if num < 0:
            return float("-inf")
        return float("nan")

    def _direction(a, b):
        # +1 if a ranks above b, -1 if below, 0 if tied.
        return int(a > b) - int(a < b)

    # NOTE(review): tab_cross is expected to return a two-dimensional pandas
    # DataFrame of cell counts - confirm against its implementation.
    ct = tab_cross(ordField1, ordField2, order1=levels1, order2=levels2)
    k1, k2 = ct.shape

    # Scores that decide the ordering of rows and columns. Positions are used
    # when ranks are requested or an explicit level order was supplied,
    # otherwise the labels of the cross table are compared directly.
    if useRanks or levels1 is not None:
        rowScores = list(range(k1))
    else:
        rowScores = ct.index.tolist()

    if useRanks or levels2 is not None:
        colScores = list(range(k2))
    else:
        colScores = ct.columns.tolist()

    # Work on plain Python numbers: much faster than per-cell DataFrame
    # access, and Python integers cannot overflow in the large intermediate
    # products of the ASE formulas.
    counts = ct.values.tolist()

    # Ordering of every pair of rows / columns, computed once.
    rowDir = [[_direction(rowScores[h], rowScores[i]) for h in range(k1)] for i in range(k1)]
    colDir = [[_direction(colScores[k], colScores[j]) for k in range(k2)] for j in range(k2)]

    # conc[i][j] (C_ij) and disc[i][j] (D_ij) must have the same r x c shape
    # as the cross table, also when the table is not square.
    conc = [[0] * k2 for _ in range(k1)]
    disc = [[0] * k2 for _ in range(k1)]

    for i in range(k1):
        for h in range(k1):
            rDir = rowDir[i][h]
            if rDir == 0:
                # same row score: tied on the first variable, pair is ignored
                continue
            for j in range(k2):
                for k in range(k2):
                    cDir = colDir[j][k]
                    if cDir == 0:
                        # tied on the second variable
                        continue
                    if rDir == cDir:
                        conc[i][j] += counts[h][k]
                    else:
                        disc[i][j] += counts[h][k]

    n = 0
    p = 0
    q = 0
    for i in range(k1):
        for j in range(k2):
            f = counts[i][j]
            n += f
            p += f * conc[i][j]
            q += f * disc[i][j]

    if p + q == 0:
        # No concordant or discordant pairs at all: gamma is undefined.
        g = z = pValue = float("nan")
    else:
        g = (p - q) / (p + q)

        if ase == "appr":
            z = g * _ratio(p + q, n * (1 - g**2))**0.5
        elif ase == 0:
            radicand = sum(counts[i][j] * (conc[i][j] - disc[i][j])**2
                           for i in range(k1) for j in range(k2)) - (p - q)**2 / n
            # Guard against a tiny negative value caused by rounding, which
            # would otherwise turn the square root into a complex number.
            ase0 = 2 * max(radicand, 0.0)**0.5 / (p + q)
            z = _ratio(g, ase0)
        else:
            sumSq = sum(counts[i][j] * (q * conc[i][j] - p * disc[i][j])**2
                        for i in range(k1) for j in range(k2))
            ase1 = 4 * sumSq**0.5 / ((p + q)**2)
            z = _ratio(g, ase1)

        pValue = 2 * (1 - NormalDist().cdf(abs(z)))

    res = pd.DataFrame([[g, z, pValue]], columns=["gamma", "statistic", "p-value"])

    return res