Module `stikpetP.correlations.cor_spearman_rho`

Expand source code

import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross
from ..distributions.dist_spearman import di_scdf

def r_spearman_rho(ordField1, ordField2, levels1=None, levels2=None, test="t", cc=False):
    '''
    Spearman Rho / Spearman Rank Correlation Coefficient
    ----------------------------------------------------
    Spearman rho varies between -1 and +1. If it is -1 there is a perfect negative monotonic relationship, if it is 0 there is no monotonic relationship and at +1 there is a perfect positive monotonic relationship. Monotonic means that it is constantly increasing or constantly decreasing. A positive relation means that if one variable goes up, the other also goes up (for example number of ice cream sold versus temperature), a negative relation indicates if one goes down, the other goes up (for example number of winter jackets sold versus temperature).
    
    We can test if Spearman rho might be significantly different from 0 in the population. 
    
    Alternatives are Somers D, Gamma, Kendall Tau, and Stuart-Kendall Tau.
    
    Kendall Tau b looks at so-called discordant and concordant pairs, but unlike Gamma it does not ignore tied pairs. Stuart-Kendall Tau c does the same, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.
    
    Parameters
    ----------
    ordField1 : pandas series
        the ordinal or scale scores of the first variable
    ordField2 : pandas series
        the ordinal or scale scores of the second variable
    levels1 : list or dictionary, optional
        the categories to use from ordField1
    levels2 : list or dictionary, optional
        the categories to use from ordField2
    test : {"t", "as89", "exact", "iman-conover", "z-fieller", "z-olds", "none"}, optional
        which test to use
    cc : boolean, optional
        to indicate the use of a continuity correction. Ignored when test="exact".
        
    Returns
    -------
    A dataframe with, depending on the test:
    
    * *Spearman rho*, the correlation coefficient value
    * *p-value*, the p-value (significance)
    * *statistic*, the test statistic
    * *df*, the degrees of freedom
    
    If "none" is used only the value is returned.
    
    Raises
    ------
    ValueError
        if *test* is not one of the supported options
    
    Notes
    -----
    The formula used (Spearman, 1904, p. 77):
    $$r_s = \\frac{\\sum_{i=1}^n \\left(R_{x,i} - \\bar{R}_x\\right)\\times \\left(R_{y,i} - \\bar{R}_y\\right)}{\\sqrt{SS_{R_x}\\times SS_{R_y}}}$$
    
    With:
    $$SS_{R_x} = \\sum_{i=1}^{n_x} \\left(R_{x,i} - \\bar{R}_x\\right)^2$$
    $$SS_{R_y} = \\sum_{i=1}^{n_y} \\left(R_{y,i} - \\bar{R}_y\\right)^2$$
    $$\\bar{R}_x = \\frac{\\sum_{i=1}^{n_x} R_{x,i}}{n_x}$$
    $$\\bar{R}_y = \\frac{\\sum_{i=1}^{n_y} R_{y,i}}{n_y}$$
    
    *Symbols Used:*
    
    * \\(R_{x_i}\\), is the rank of the i-th score from variable x
    * \\(R_{y_i}\\), is the rank of the i-th score from variable y
    * \\(\\bar{R}_x\\), is the average rank of the scores from variable x
    * \\(\\bar{R}_y\\), is the average rank of the scores from variable y
    * \\(n_x\\), is the number of scores from variable x
    * \\(n_y\\), is the number of scores from variable y
    * \\(n\\), is the number of pairs of x and y
    
    See the details of di_scdf() for calculations of the Spearman rank distribution.
    
    A continuity correction can be applied (Zar, 1972, p. 579):
    $$r_s^{cc} = \\left|r_s\\right| - \\frac{6}{n^3 - n}$$
    
    The corrected value is what gets reported, so it no longer carries the sign of the relationship. The tail used for the two-sided p-value is chosen from the sign of the uncorrected coefficient, and the p-value is capped at 1.
    
    References
    ----------
    Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. *Advances in Methodology and Statistics, 8*(1). doi:10.51936/milh5641
    
    Spearman, C. (1904). The proof and measurement of association between two things. *The American Journal of Psychology, 15*(1), 72–101.
    
    Zar, J. H. (1972). Significance testing of the Spearman rank correlation coefficient. *Journal of the American Statistical Association, 67*(339), 578–580. doi:10.1080/01621459.1972.10481251
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    '''
    
    # fail fast on an unsupported test instead of hitting an unbound result later
    valid_tests = ("t", "as89", "exact", "iman-conover", "z-fieller", "z-olds", "none")
    if test not in valid_tests:
        raise ValueError(f"test must be one of {valid_tests}, got {test!r}")
    
    # cell counts of the cross table, rows = levels of field 1, columns = levels of field 2
    ct = tab_cross(ordField1, ordField2, order1=levels1, order2=levels2)
    counts = ct.to_numpy(dtype=float)
    
    # marginal totals and the number of pairs
    rowTotals = counts.sum(axis=1)
    colTotals = counts.sum(axis=0)
    n = rowTotals.sum()
    
    # every case in a level shares the mid-rank of that level:
    # (cases in earlier levels) + (cases in this level + 1) / 2
    lvlRank1 = rowTotals.cumsum() - rowTotals + (rowTotals + 1) / 2
    lvlRank2 = colTotals.cumsum() - colTotals + (colTotals + 1) / 2
    
    # deviations of the level ranks from the (frequency weighted) mean rank
    dev1 = lvlRank1 - (rowTotals * lvlRank1).sum() / n
    dev2 = lvlRank2 - (colTotals * lvlRank2).sum() / n
    
    # sums of squares and the sum of cross products, weighted by the counts
    ss1 = (rowTotals * dev1**2).sum()
    ss2 = (colTotals * dev2**2).sum()
    cov = (counts * dev1[:, None] * dev2[None, :]).sum()
    
    rRaw = cov / (ss1 * ss2)**0.5
    
    # Zar's continuity correction is applied to the absolute value,
    # so keep the uncorrected coefficient for deciding the tail below
    r = rRaw
    if cc and test != "exact":
        r = abs(rRaw) - 6/(n**3 - n)
    
    if test == "none":
        return r
    
    diRes = di_scdf(ordField1, ordField2, levels1, levels2, test)
    
    # these tests give the cdf value directly, the others a dataframe
    # with the cdf value in the first cell of the first row
    scalarTests = ("as89", "exact", "iman-conover")
    if test in scalarTests:
        pCdf = diRes
    else:
        pCdf = diRes.iloc[0, 0]
    
    # di_scdf is not told about cc, so its cdf value belongs to the
    # uncorrected coefficient; pick the tail from that sign
    if rRaw > 0:
        p = 2 * (1 - pCdf)
    else:
        p = 2 * pCdf
    # a two-sided p-value cannot exceed 1 (argument order keeps a NaN as NaN)
    p = min(p, 1.0)
    
    if test in scalarTests:
        res = pd.DataFrame([[r, p]], columns=["Spearman rho", "p-value"])
    elif test == "t":
        res = pd.DataFrame([[r, p, diRes.iloc[0, 1], diRes.iloc[0, 2]]],
                           columns=["Spearman rho", "p-value", "statistic", "df"])
    else:
        # "z-fieller" or "z-olds"
        res = pd.DataFrame([[r, p, diRes.iloc[0, 1]]],
                           columns=["Spearman rho", "p-value", "statistic"])
    
    return res

Functions

def r_spearman_rho(ordField1, ordField2, levels1=None, levels2=None, test='t', cc=False)

Spearman Rho / Spearman Rank Correlation Coefficient

Spearman rho varies between -1 and +1. If it is -1 there is a perfect negative monotonic relationship, if it is 0 there is no monotonic relationship and at +1 there is a perfect positive monotonic relationship. Monotonic means that it is constantly increasing or constantly decreasing. A positive relation means that if one variable goes up, the other also goes up (for example number of ice cream sold versus temperature), a negative relation indicates if one goes down, the other goes up (for example number of winter jackets sold versus temperature).

We can test if Spearman rho might be significantly different from 0 in the population.

Alternatives are Somers D, Gamma, Kendall Tau, and Stuart-Kendall Tau.

Kendall Tau b looks at so-called discordant and concordant pairs, but unlike Gamma it does not ignore tied pairs. Stuart-Kendall Tau c does the same, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.

Parameters

ordField1 : pandas series: the ordinal or scale scores of the first variable
ordField2 : pandas series: the ordinal or scale scores of the second variable
levels1 : list or dictionary, optional: the categories to use from ordField1
levels2 : list or dictionary, optional: the categories to use from ordField2
test : {"t", "as89", "exact", "iman-conover", "z-fieller", "z-olds", "none"}, optional: which test to use
cc : boolean, optional: to indicate the use of a continuity correction

Returns

A dataframe with, depending on the test:

Spearman rho, the correlation coefficient value
p-value, the p-value (significance)
statistic, the test statistic
df, the degrees of freedom

If "none" is used only the value is returned.

Notes

The formula used (Spearman, 1904, p. 77): $r_s = \frac{\sum_{i=1}^n \left(R_{x,i} - \bar{R}_x\right)\times \left(R_{y,i} - \bar{R}_y\right)}{\sqrt{SS_{R_x}\times SS_{R_y}}}$

With: $SS_{R_x} = \sum_{i=1}^{n_x} \left(R_{x,i} - \bar{R}_x\right)^2$ $SS_{R_y} = \sum_{i=1}^{n_y} \left(R_{y,i} - \bar{R}_y\right)^2$ $\bar{R}_x = \frac{\sum_{i=1}^{n_x} R_{x,i}}{n_x}$ $\bar{R}_y = \frac{\sum_{i=1}^{n_y} R_{y,i}}{n_y}$

Symbols Used:

$R_{x_i}$ , is the rank of the i-th score from variable x
$R_{y_i}$ , is the rank of the i-th score from variable y
$\bar{R}_x$ , is the average rank of the scores from variable x
$\bar{R}_y$ , is the average rank of the scores from variable y
$n_x$ , is the number of scores from variable x
$n_y$ , is the number of scores from variable y
$n$ , is the number of pairs of x and y

See the details of di_scdf() for calculations of the Spearman rank distribution.

A continuity correction can be applied (Zar, 1972, p. 579): $r_s^{cc} = \left|r_s\right| - \frac{6}{n^3 - n}$

References

Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. Advances in Methodology and Statistics, 8(1). doi:10.51936/milh5641

Spearman, C. (1904). The proof and measurement of association between two things. The American Journal of Psychology, 15(1), 72–101.

Zar, J. H. (1972). Significance testing of the Spearman rank correlation coefficient. Journal of the American Statistical Association, 67(339), 578–580. doi:10.1080/01621459.1972.10481251

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def r_spearman_rho(ordField1, ordField2, levels1=None, levels2=None, test="t", cc=False):
    '''
    Spearman Rho / Spearman Rank Correlation Coefficient
    ----------------------------------------------------
    Spearman rho varies between -1 and +1. If it is -1 there is a perfect negative monotonic relationship, if it is 0 there is no monotonic relationship and at +1 there is a perfect positive monotonic relationship. Monotonic means that it is constantly increasing or constantly decreasing. A positive relation means that if one variable goes up, the other also goes up (for example number of ice cream sold versus temperature), a negative relation indicates if one goes down, the other goes up (for example number of winter jackets sold versus temperature).
    
    We can test if Spearman rho might be significantly different from 0 in the population. 
    
    Alternatives are Somers D, Gamma, Kendall Tau, and Stuart-Kendall Tau.
    
    Kendall Tau b looks at so-called discordant and concordant pairs, but unlike Gamma it does not ignore tied pairs. Stuart-Kendall Tau c does the same, but also takes the size of the table into consideration. Somers d only makes a correction for tied pairs in one of the two directions. Spearman rho is more of a variation on Pearson correlation, but applied to ranks. See Göktaş and İşçi (2011) for more information on the comparisons.
    
    Parameters
    ----------
    ordField1 : pandas series
        the ordinal or scale scores of the first variable
    ordField2 : pandas series
        the ordinal or scale scores of the second variable
    levels1 : list or dictionary, optional
        the categories to use from ordField1
    levels2 : list or dictionary, optional
        the categories to use from ordField2
    test : {"t", "as89", "exact", "iman-conover", "z-fieller", "z-olds", "none"}, optional
        which test to use
    cc : boolean, optional
        to indicate the use of a continuity correction. Ignored when test="exact".
        
    Returns
    -------
    A dataframe with, depending on the test:
    
    * *Spearman rho*, the correlation coefficient value
    * *p-value*, the p-value (significance)
    * *statistic*, the test statistic
    * *df*, the degrees of freedom
    
    If "none" is used only the value is returned.
    
    Raises
    ------
    ValueError
        if *test* is not one of the supported options
    
    Notes
    -----
    The formula used (Spearman, 1904, p. 77):
    $$r_s = \\frac{\\sum_{i=1}^n \\left(R_{x,i} - \\bar{R}_x\\right)\\times \\left(R_{y,i} - \\bar{R}_y\\right)}{\\sqrt{SS_{R_x}\\times SS_{R_y}}}$$
    
    With:
    $$SS_{R_x} = \\sum_{i=1}^{n_x} \\left(R_{x,i} - \\bar{R}_x\\right)^2$$
    $$SS_{R_y} = \\sum_{i=1}^{n_y} \\left(R_{y,i} - \\bar{R}_y\\right)^2$$
    $$\\bar{R}_x = \\frac{\\sum_{i=1}^{n_x} R_{x,i}}{n_x}$$
    $$\\bar{R}_y = \\frac{\\sum_{i=1}^{n_y} R_{y,i}}{n_y}$$
    
    *Symbols Used:*
    
    * \\(R_{x_i}\\), is the rank of the i-th score from variable x
    * \\(R_{y_i}\\), is the rank of the i-th score from variable y
    * \\(\\bar{R}_x\\), is the average rank of the scores from variable x
    * \\(\\bar{R}_y\\), is the average rank of the scores from variable y
    * \\(n_x\\), is the number of scores from variable x
    * \\(n_y\\), is the number of scores from variable y
    * \\(n\\), is the number of pairs of x and y
    
    See the details of di_scdf() for calculations of the Spearman rank distribution.
    
    A continuity correction can be applied (Zar, 1972, p. 579):
    $$r_s^{cc} = \\left|r_s\\right| - \\frac{6}{n^3 - n}$$
    
    The corrected value is what gets reported, so it no longer carries the sign of the relationship. The tail used for the two-sided p-value is chosen from the sign of the uncorrected coefficient, and the p-value is capped at 1.
    
    References
    ----------
    Göktaş, A., & İşçi, Ö. (2011). A comparison of the most commonly used measures of association for doubly ordered square contingency tables via simulation. *Advances in Methodology and Statistics, 8*(1). doi:10.51936/milh5641
    
    Spearman, C. (1904). The proof and measurement of association between two things. *The American Journal of Psychology, 15*(1), 72–101.
    
    Zar, J. H. (1972). Significance testing of the Spearman rank correlation coefficient. *Journal of the American Statistical Association, 67*(339), 578–580. doi:10.1080/01621459.1972.10481251
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    '''
    
    # fail fast on an unsupported test instead of hitting an unbound result later
    valid_tests = ("t", "as89", "exact", "iman-conover", "z-fieller", "z-olds", "none")
    if test not in valid_tests:
        raise ValueError(f"test must be one of {valid_tests}, got {test!r}")
    
    # cell counts of the cross table, rows = levels of field 1, columns = levels of field 2
    ct = tab_cross(ordField1, ordField2, order1=levels1, order2=levels2)
    counts = ct.to_numpy(dtype=float)
    
    # marginal totals and the number of pairs
    rowTotals = counts.sum(axis=1)
    colTotals = counts.sum(axis=0)
    n = rowTotals.sum()
    
    # every case in a level shares the mid-rank of that level:
    # (cases in earlier levels) + (cases in this level + 1) / 2
    lvlRank1 = rowTotals.cumsum() - rowTotals + (rowTotals + 1) / 2
    lvlRank2 = colTotals.cumsum() - colTotals + (colTotals + 1) / 2
    
    # deviations of the level ranks from the (frequency weighted) mean rank
    dev1 = lvlRank1 - (rowTotals * lvlRank1).sum() / n
    dev2 = lvlRank2 - (colTotals * lvlRank2).sum() / n
    
    # sums of squares and the sum of cross products, weighted by the counts
    ss1 = (rowTotals * dev1**2).sum()
    ss2 = (colTotals * dev2**2).sum()
    cov = (counts * dev1[:, None] * dev2[None, :]).sum()
    
    rRaw = cov / (ss1 * ss2)**0.5
    
    # Zar's continuity correction is applied to the absolute value,
    # so keep the uncorrected coefficient for deciding the tail below
    r = rRaw
    if cc and test != "exact":
        r = abs(rRaw) - 6/(n**3 - n)
    
    if test == "none":
        return r
    
    diRes = di_scdf(ordField1, ordField2, levels1, levels2, test)
    
    # these tests give the cdf value directly, the others a dataframe
    # with the cdf value in the first cell of the first row
    scalarTests = ("as89", "exact", "iman-conover")
    if test in scalarTests:
        pCdf = diRes
    else:
        pCdf = diRes.iloc[0, 0]
    
    # di_scdf is not told about cc, so its cdf value belongs to the
    # uncorrected coefficient; pick the tail from that sign
    if rRaw > 0:
        p = 2 * (1 - pCdf)
    else:
        p = 2 * pCdf
    # a two-sided p-value cannot exceed 1 (argument order keeps a NaN as NaN)
    p = min(p, 1.0)
    
    if test in scalarTests:
        res = pd.DataFrame([[r, p]], columns=["Spearman rho", "p-value"])
    elif test == "t":
        res = pd.DataFrame([[r, p, diRes.iloc[0, 1], diRes.iloc[0, 2]]],
                           columns=["Spearman rho", "p-value", "statistic", "df"])
    else:
        # "z-fieller" or "z-olds"
        res = pd.DataFrame([[r, p, diRes.iloc[0, 1]]],
                           columns=["Spearman rho", "p-value", "statistic"])
    
    return res