Module stikpetP.correlations.cor_somers_d

Source code
import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross

def r_somers_d(ordField1, ordField2, levels1=None, levels2=None, useRanks=False):
    '''
    Somers D
    --------
    A rank correlation coefficient. It ranges from -1 (perfect negative association) to 1 (perfect positive association). A zero would indicate no correlation at all.
    
    A positive correlation indicates that if someone scored high on the first field, they also likely score high on the second, while a negative correlation would indicate a high score on the first would give a low score on the second.
    
    Alternatives for Somers d are Goodman-Kruskal Gamma, Kendall Tau b, and Stuart-Kendall Tau c, but Spearman rho could also be considered.

    Kendall Tau b looks at so-called concordant and discordant pairs and, unlike Gamma, does not ignore tied pairs. Stuart-Kendall Tau c does the same, but also takes the size of the table into consideration. Somers d corrects for ties on only one of the two variables (the independent one). Spearman rho is essentially the Pearson correlation applied to ranks. See Göktaş and İşçi (2011) for a comparison of these measures.

    If there are no ties, Kendall Tau a gives the same result as Goodman-Kruskal Gamma.
    
    Parameters
    ----------
    ordField1 : pandas series
        the ordinal or scale scores of the first variable
    ordField2 : pandas series
        the ordinal or scale scores of the second variable
    levels1 : list or dictionary, optional
        the categories to use from ordField1
    levels2 : list or dictionary, optional
        the categories to use from ordField2
    useRanks : boolean, optional
        rank the data first or not. Default is False
        
    Returns
    -------
    A dataframe with:
    
    * *dependent*, the version in that row: "symmetric", "field 1" (first field treated as dependent), or "field 2" (second field treated as dependent)
    * *Somers d*, the Somers d value
    * *statistic*, the test statistic (z-value)
    * *p-value*, the p-value (significance)
    
    Notes
    -----
    
    The formula used for the asymmetric versions (Somers, 1962, p. 804):
    $$d_{Y|X} = \\frac{P-Q}{D_r}$$
    $$d_{X|Y} = \\frac{P-Q}{D_c}$$
    
    The formula used for the symmetric version (SPSS, 2006, p. 121):
    $$d = \\frac{2\\times\\left(P-Q\\right)}{D_r + D_c}$$
    
    For the significance (p-value) the following is used (SPSS, 2006, p. 121):
    $$z_{d} = \\frac{d}{ASE\\left(d\\right)_0}$$
    $$ASE\\left(d\\right)_0 = \\frac{4}{D_r + D_c}\\times s$$
    $$ASE\\left(d\\right)_1 = \\frac{2\\times ASE\\left(\\tau_b\\right)_1}{D_r + D_c}\\times \\sqrt{D_r\\times D_c}$$
    
    $$z_{d_{Y|X}} = \\frac{d_{Y|X}}{ASE\\left(d_{Y|X}\\right)_0}$$
    $$ASE\\left(d_{Y|X}\\right)_0 = \\frac{2}{D_r}\\times s$$
    $$ASE\\left(d_{Y|X}\\right)_1 = \\frac{2\\times\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(D_r\\times\\left(C_{i,j}-D_{i,j}\\right)-\\left(P-Q\\right)\\times\\left(n-RS_i\\right)\\right)^2}}{D_r^2}$$
    
    $$z_{d_{X|Y}} = \\frac{d_{X|Y}}{ASE\\left(d_{X|Y}\\right)_0}$$
    $$ASE\\left(d_{X|Y}\\right)_0 = \\frac{2}{D_c}\\times s$$
    $$ASE\\left(d_{X|Y}\\right)_1 = \\frac{2\\times\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(D_c\\times\\left(C_{i,j}-D_{i,j}\\right)-\\left(P-Q\\right)\\times\\left(n-CS_j\\right)\\right)^2}}{D_c^2}$$

    The two-sided significance (p-value) is then:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z\\right|\\right)\\right)$$
    
    With:
    $$P = \\sum_{i,j} P_{i,j}$$
    $$Q = \\sum_{i,j} Q_{i,j}$$
    $$P_{i,j} = F_{i,j}\\times C_{i,j}$$
    $$Q_{i,j} = F_{i,j}\\times D_{i,j}$$
    $$C_{i,j} = \\sum_{h<i}\\sum_{k<j} F_{h,k} + \\sum_{h>i}\\sum_{k>j} F_{h,k}$$
    $$D_{i,j} = \\sum_{h<i}\\sum_{k>j} F_{h,k} + \\sum_{h>i}\\sum_{k<j} F_{h,k}$$
    $$D_r = n^2 - \\sum_{i=1}^r RS_i^2$$
    $$D_c = n^2 - \\sum_{j=1}^c CS_j^2$$
    $$RS_i = \\sum_{j=1}^c F_{i,j}$$
    $$CS_j = \\sum_{i=1}^r F_{i,j}$$
    $$n = \\sum_{i=1}^r \\sum_{j=1}^c F_{i,j} = \\sum_{j=1}^c CS_j = \\sum_{i=1}^r RS_i$$
    $$s = \\sqrt{\\sum_{i=1}^r \\sum_{j=1}^c F_{i,j}\\times\\left(C_{i,j}-D_{i,j}\\right)^2 - \\frac{\\left(P-Q\\right)^2}{n}} $$
    
    *Symbols Used*
    
    * \\(F_{i,j}\\), the number of cases in row i, column j.
    * \\(n\\), the total sample size
    * \\(r\\), the number of rows
    * \\(c\\), the number of columns
    * \\(\\Phi\\left(\\dots\\right)\\), the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. *Advances in Methodology and Statistics, 8*(1). doi:10.51936/milh5641
    
    Somers, R. H. (1962). A new asymmetric measure of association for ordinal variables. *American Sociological Review, 27*(6), 799–811. doi:10.2307/2090408
    
    SPSS. (2006). SPSS 15.0 algorithms.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    
    '''
    
    ct = tab_cross(ordField1, ordField2, order1=levels1, order2=levels2)
    k1 = ct.shape[0]
    k2 = ct.shape[1]
    
    if not useRanks:
        if levels1 is not None:
            #replace row labels with numeric score
            ct = ct.reset_index(drop=True)
            
        if levels2 is not None:            
            ct.columns = [i for i in range(0, k2)]
    
    n = 0
    # k1 x k2 tables holding, per cell, the counts of concordant and discordant cases
    conc = pd.DataFrame([[0]*k2 for _ in range(k1)])
    disc = pd.DataFrame([[0]*k2 for _ in range(k1)])
    
    # tally for each cell (i, j) how many cases are in a concordant and how many in a
    # discordant position; positions are compared directly when useRanks is True,
    # otherwise the actual row and column labels are compared
    for i in range(0, k1):
        for j in range(0, k2):
            for h in range(0, k1):
                for k in range(0, k2):
                    
                    if useRanks:
                        if h > i and k > j:
                            conc.iloc[i,j] = conc.iloc[i,j] + ct.iloc[h,k]
                        elif h<i and k<j:
                            conc.iloc[i,j] = conc.iloc[i,j] + ct.iloc[h,k]
                        elif h>i and k<j:
                            disc.iloc[i,j] = disc.iloc[i,j] + ct.iloc[h,k]
                        elif h<i and k>j:
                            disc.iloc[i,j] = disc.iloc[i,j] + ct.iloc[h,k]                        
                    else:
                        if ct.index[h] > ct.index[i] and ct.columns[k] > ct.columns[j]:
                            conc.iloc[i,j] = conc.iloc[i,j] + ct.iloc[h,k]
                        elif ct.index[h] < ct.index[i] and ct.columns[k] < ct.columns[j]:
                            conc.iloc[i,j] = conc.iloc[i,j] + ct.iloc[h,k]
                        elif ct.index[h] > ct.index[i] and ct.columns[k] < ct.columns[j]:
                            disc.iloc[i,j] = disc.iloc[i,j] + ct.iloc[h,k]
                        elif ct.index[h] < ct.index[i] and ct.columns[k] > ct.columns[j]:
                            disc.iloc[i,j] = disc.iloc[i,j] + ct.iloc[h,k]
            n = n + ct.iloc[i,j]
    
    ct = ct.reset_index(drop=True)
    ct.columns = [i for i in range(0, k2)]
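    # ct now has positional row and column labels, so it aligns element-wise with conc and disc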
    
    p = (ct*conc).sum().sum()
    q = (ct*disc).sum().sum()
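    # p and q are the doubled counts of concordant and discordant pairs (P and Q in the notes)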
    
    rs = ct.sum(axis=1)
    cs = ct.sum()
    
    dr = n**2 - (rs**2).sum()
    dc = n**2 - (cs**2).sum()
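    # dr and dc are twice the number of pairs not tied on the first, resp. second, variable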
    
    dyx = (p - q)/dr
    dxy = (p - q)/dc
    d = 2 * (p - q) / (dr + dc)
    
    S = (ct*(conc-disc)**2).sum().sum()
    S = (S - (p - q)**2 / n)**0.5
    
    ase0 = 4 / (dr + dc) * S
    ase0yx = 2 / dr * S
    ase0xy = 2 / dc * S
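    # the ASE0 values from the notes, i.e. the asymptotic standard errors under the null hypothesis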
        
    Z = d / ase0
    Zyx = dyx / ase0yx
    Zxy = dxy / ase0xy
            
    pVal = 2 * (1 - NormalDist().cdf(abs(Z)))
    pValyx = 2 * (1 - NormalDist().cdf(abs(Zyx)))
    pValxy = 2 * (1 - NormalDist().cdf(abs(Zxy)))
    
    r1 = ["symmetric", d, Z, pVal]
    r2 = ["field 1", dxy, Zxy, pValxy]
    r3 = ["field 2", dyx, Zyx, pValyx]
    res = pd.DataFrame([r1, r2, r3])
    res.columns = ["dependent", "Somers d", "statistic", "p-value"]
    
    return res

Functions

def r_somers_d(ordField1, ordField2, levels1=None, levels2=None, useRanks=False)

Somers D

A rank correlation coefficient. It ranges from -1 (perfect negative association) to 1 (perfect positive association). A zero would indicate no correlation at all.

A positive correlation indicates that if someone scored high on the first field, they also likely score high on the second, while a negative correlation would indicate a high score on the first would give a low score on the second.

Alternatives for Somers d are Goodman-Kruskal Gamma, Kendall Tau b, and Stuart-Kendall Tau c, but Spearman rho could also be considered.

Kendall Tau b looks at so-called concordant and discordant pairs and, unlike Gamma, does not ignore tied pairs. Stuart-Kendall Tau c does the same, but also takes the size of the table into consideration. Somers d corrects for ties on only one of the two variables (the independent one). Spearman rho is essentially the Pearson correlation applied to ranks. See Göktaş and İşçi (2011) for a comparison of these measures.

If there are no ties, Kendall Tau a gives the same result as Goodman-Kruskal Gamma.

Parameters

ordField1 : pandas series
the ordinal or scale scores of the first variable
ordField2 : pandas series
the ordinal or scale scores of the second variable
levels1 : list or dictionary, optional
the categories to use from ordField1
levels2 : list or dictionary, optional
the categories to use from ordField2
useRanks : boolean, optional
rank the data first or not. Default is False

Returns

A dataframe with:
 
  • dependent, the version in that row: symmetric, field 1 (first field treated as dependent), or field 2 (second field treated as dependent)
  • Somers d, the Somers d value
  • statistic, the test statistic (z-value)
  • p-value, the p-value (significance)
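
For illustration, a minimal usage sketch (the scores below are made up, and it assumes the package is importable as stikpetP; the commented-out line with text categories is likewise hypothetical):

import pandas as pd
from stikpetP.correlations.cor_somers_d import r_somers_d

# two hypothetical ordinal variables (e.g. scores on two Likert items)
item1 = pd.Series([1, 2, 2, 3, 3, 3, 4, 4, 5, 5])
item2 = pd.Series([1, 1, 2, 2, 3, 4, 4, 5, 4, 5])

res = r_somers_d(item1, item2)
print(res)
# returns one row each for the symmetric version, field 1 dependent, and field 2 dependent

# if the scores were stored as text categories, the order could be passed explicitly, e.g.
# r_somers_d(grades1, grades2, levels1=["low", "medium", "high"], levels2=["low", "medium", "high"])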

Notes

The formula used for the asymmetric versions (Somers, 1962, p. 804):

$$d_{Y|X} = \frac{P-Q}{D_r}$$
$$d_{X|Y} = \frac{P-Q}{D_c}$$

The formula used for the symmetric version (SPSS, 2006, p. 121):

$$d = \frac{2\times\left(P-Q\right)}{D_r + D_c}$$

For the significance (p-value) the following is used (SPSS, 2006, p. 121):

$$z_{d} = \frac{d}{ASE\left(d\right)_0}$$
$$ASE\left(d\right)_0 = \frac{4}{D_r + D_c}\times s$$
$$ASE\left(d\right)_1 = \frac{2\times ASE\left(\tau_b\right)_1}{D_r + D_c}\times \sqrt{D_r\times D_c}$$

$$z_{d_{Y|X}} = \frac{d_{Y|X}}{ASE\left(d_{Y|X}\right)_0}$$
$$ASE\left(d_{Y|X}\right)_0 = \frac{2}{D_r}\times s$$
$$ASE\left(d_{Y|X}\right)_1 = \frac{2\times\sqrt{\sum_{i,j} F_{i,j}\times\left(D_r\times\left(C_{i,j}-D_{i,j}\right)-\left(P-Q\right)\times\left(n-RS_i\right)\right)^2}}{D_r^2}$$

$$z_{d_{X|Y}} = \frac{d_{X|Y}}{ASE\left(d_{X|Y}\right)_0}$$
$$ASE\left(d_{X|Y}\right)_0 = \frac{2}{D_c}\times s$$
$$ASE\left(d_{X|Y}\right)_1 = \frac{2\times\sqrt{\sum_{i,j} F_{i,j}\times\left(D_c\times\left(C_{i,j}-D_{i,j}\right)-\left(P-Q\right)\times\left(n-CS_j\right)\right)^2}}{D_c^2}$$

The two-sided significance (p-value) is then:

$$sig. = 2\times\left(1 - \Phi\left(\left|z\right|\right)\right)$$

With:

$$P = \sum_{i,j} P_{i,j}$$
$$Q = \sum_{i,j} Q_{i,j}$$
$$P_{i,j} = F_{i,j}\times C_{i,j}$$
$$Q_{i,j} = F_{i,j}\times D_{i,j}$$
$$C_{i,j} = \sum_{h<i}\sum_{k<j} F_{h,k} + \sum_{h>i}\sum_{k>j} F_{h,k}$$
$$D_{i,j} = \sum_{h<i}\sum_{k>j} F_{h,k} + \sum_{h>i}\sum_{k<j} F_{h,k}$$
$$D_r = n^2 - \sum_{i=1}^r RS_i^2$$
$$D_c = n^2 - \sum_{j=1}^c CS_j^2$$
$$RS_i = \sum_{j=1}^c F_{i,j}$$
$$CS_j = \sum_{i=1}^r F_{i,j}$$
$$n = \sum_{i=1}^r \sum_{j=1}^c F_{i,j} = \sum_{j=1}^c CS_j = \sum_{i=1}^r RS_i$$
$$s = \sqrt{\sum_{i=1}^r \sum_{j=1}^c F_{i,j}\times\left(C_{i,j}-D_{i,j}\right)^2 - \frac{\left(P-Q\right)^2}{n}}$$

Symbols Used

  • \(F_{i,j}\), the number of cases in row i, column j.
  • \(n\), the total sample size
  • \(r\), the number of rows
  • \(c\), the number of columns
  • \(\Phi\left(\dots\right)\), the cumulative distribution function of the standard normal distribution.
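
As a rough numeric illustration of these formulas (not part of the package), the sketch below computes P, Q, D_r, D_c and the three d values directly from a small, made-up frequency table using plain NumPy:

import numpy as np

# hypothetical 2x3 frequency table F (rows = field 1, columns = field 2)
F = np.array([[10,  5,  2],
              [ 3,  6,  9]])
n = F.sum()

r, c = F.shape
C = np.zeros_like(F)   # C[i,j]: cases above-left or below-right of cell (i,j)
D = np.zeros_like(F)   # D[i,j]: cases above-right or below-left of cell (i,j)
for i in range(r):
    for j in range(c):
        C[i, j] = F[:i, :j].sum() + F[i+1:, j+1:].sum()
        D[i, j] = F[:i, j+1:].sum() + F[i+1:, :j].sum()

P = (F * C).sum()
Q = (F * D).sum()
Dr = n**2 - (F.sum(axis=1)**2).sum()
Dc = n**2 - (F.sum(axis=0)**2).sum()

d_yx = (P - Q) / Dr                 # d Y|X, second field dependent
d_xy = (P - Q) / Dc                 # d X|Y, first field dependent
d_sym = 2 * (P - Q) / (Dr + Dc)     # symmetric version
print(d_yx, d_xy, d_sym)

Passing raw data with these frequencies to r_somers_d should reproduce the same values in the Somers d column.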

References

Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. Advances in Methodology and Statistics, 8(1). doi:10.51936/milh5641

Somers, R. H. (1962). A new asymmetric measure of association for ordinal variables. American Sociological Review, 27(6), 799–811. doi:10.2307/2090408

SPSS. (2006). SPSS 15.0 algorithms.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
