Module stikpetP.tests.test_friedman
import pandas as pd
from scipy.stats import rankdata
from scipy.stats import f
from scipy.stats import chi2
from statistics import NormalDist
def ts_friedman(data, levels=None, ties=True, dist="chi"):
    '''
    Friedman Test
    -------------
    A test to determine if any of the variables has a significantly different average ranking than any of the others.
    It is a paired-samples version of the Kruskal-Wallis test. If the p-value is below a pre-defined threshold (usually 0.05), it indicates that at least one variable (column) differs from another.
    Parameters
    ----------
    data : dataframe
        dataframe with a column for each variable
    levels : dataframe or dictionary, optional
        indication of what the levels are, in order
    ties : boolean, optional
        apply a ties correction. Default is True
    dist : {"chi", "f", "normal"}, optional
        distribution to use. Default is "chi"
    Returns
    -------
    res : dataframe
        with the following columns
        * *n*, sample size
        * *statistic*, the test statistic used
        * *df*, *df1*, *df2*, degrees of freedom (if applicable)
        * *p-value*, the p-value (significance)
    Notes
    -----
    The formula used in case of no ties (Friedman, 1937, p. 679):
    $$\\chi_F^2 = \\left(\\frac{12}{n\\times k\\times\\left(k+1\\right)}\\times\\sum_{j=1}^k R_j^2\\right)-3\\times n\\times\\left(k+1\\right)$$
    $$df = k - 1$$
    With:
    $$R_j = \\sum_{i=1}^n r_{i,j}$$
    In case a ties correction is used (Hollander & Wolfe, 1999, p. 274):
    $$\\chi_{Fadj}^2 = \\frac{12\\times \\sum_{j=1}^k R_j^2 - 3\\times n^2\\times\\left(k +1\\right)^2}{n\\times\\left(k+1\\right) - \\frac{\\left(\\sum t_{i,j}^3\\right)-k}{k-1}}$$
    The ties correction used by IBM SPSS (2021, p. 811) will give the same result:
    $$\\chi_{Fadj}^2 = \\frac{\\chi_F^2}{1 - \\frac{\\sum t_{i,j}^3 - t_{i,j}}{n\\times\\left(k^3-k\\right)}}$$
    In case of ties, the function itself uses a one-way ANOVA approach on the ranks, which leads to the same result:
    $$\\chi_{Fadj}^2 = \\frac{n\\times\\sum_{j=1}^k\\left(\\bar{r}_j -\\bar{r}\\right)^2}{\\left(\\frac{\\sum_{j=1}^k\\sum_{i=1}^n \\left(r_{i,j}-\\bar{r}\\right)^2}{n\\times\\left(k-1\\right)}\\right)}$$
    With:
    $$\\bar{r}_j = \\frac{R_j}{n}$$
    $$\\bar{r} = \\frac{\\sum_{j=1}^k R_j}{n\\times k} = \\frac{k+1}{2}$$
    The significance is then determined using:
    $$sig. = 1 - \\chi^2\\left(\\chi_F^2, df\\right)$$
    A normal distribution approximation was proposed by Friedman (1937, p. 695; 1939, p. 109):
    $$z_F = \\frac{\\chi_F^2-\\left(k-1\\right)}{\\sqrt{2\\times\\frac{n-1}{n}\\times\\left(k-1\\right)}}$$
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_F\\right|\\right)\\right)$$
    And an F distribution by Iman and Davenport (1980, p. 573):
    $$F_F = \\frac{\\left(n-1\\right)\\times\\chi_F^2}{n\\times\\left(k-1\\right)-\\chi_F^2}$$
    $$df_1 = k - 1$$
    $$df_2 = \\left(k-1\\right)\\times\\left(n - 1\\right)$$
    $$sig. = 1 - F\\left(F_{F}, df_1, df_2\\right)$$
    Some might refer to Conover for this F distribution, but Conover (1980, p. 300) himself appears to credit Iman and Davenport.
    *Symbols Used*
    * \\(n\\), the number of cases
    * \\(k\\), the number of variables
    * \\(r_{i,j}\\), the rank of case i in variable j. The ranks are determined within each case.
    * \\(t_{i,j}\\), the frequency of unique rank j in case i. For each row, the frequencies of the ranks are determined in the calculations.
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley.
    Friedman, M. (1937). The use of ranks to avoid the assumption of normality implicit in the analysis of variance. *Journal of the American Statistical Association, 32*(200), 675–701. doi:10.2307/2279372
    Friedman, M. (1939). A correction. *Journal of the American Statistical Association, 34*(205), 109–109. doi:10.1080/01621459.1939.10502372
    Hollander, M., & Wolfe, D. A. (1999). *Nonparametric statistical methods* (2nd ed.). Wiley.
    IBM. (2021). *IBM SPSS Statistics Algorithms*. IBM.
    Iman, R., & Davenport, J. (1980). Approximations of the critical region of the Friedman statistic. *Communications in Statistics - Theory and Methods, 9*, 571–595.
    Author
    ------
    Made by P. Stikker
    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076
    '''
    # Remove rows with missing values and reset index
    data = data.dropna()
    data = data.reset_index(drop=True)
    n = len(data)
    k = len(data.columns)
    if levels is not None:
        data = data.replace(levels)
    ranks = pd.DataFrame()
    for i in range(0, n):
        rankRow = pd.Series(rankdata(data.iloc[i, :]))
        ranks[i] = rankRow
    ranks = ranks.transpose()
    rs = ranks.sum().sum()
    rm = rs / (n * k)
    # Determine for each variable the average rank, and
    # the squared difference of this average with the overall average rank.
    rmj = ranks.sum() / n
    rs2 = ((rmj * n) ** 2).sum()
    sst = n * ((rmj - rm) ** 2).sum()
    sse = ranks.stack().var(ddof=0) * k / (k - 1)
    if ties:
        # Hollander and Wolfe (1999, p. 274) rewritten:
        qadj = sst / sse
    else:
        # Friedman (1937, p. 679):
        qadj = 12 / (n * k * (k + 1)) * rs2 - 3 * n * (k + 1)
    df = k - 1
    if dist == "f":
        # Iman-Davenport F distribution (1980, p. 573)
        fVal = (n - 1) * qadj / (n * (k - 1) - qadj)
        df1 = df
        df2 = (k - 1) * (n - 1)
        p = f.sf(fVal, df1, df2)
        res = pd.DataFrame([[n, fVal, df1, df2, p]])
        res.columns = ["n", "statistic", "df1", "df2", "p-value"]
    elif dist == "normal":
        # Friedman normal distribution (1937, p. 695; 1939, p. 109):
        z = (qadj - (k - 1)) / (2 * (n - 1) / n * (k - 1)) ** 0.5
        p = 2 * (1 - NormalDist().cdf(abs(z)))
        res = pd.DataFrame([[n, z, p]])
        res.columns = ["n", "statistic", "p-value"]
    else:
        # Friedman chi-square distribution
        p = chi2.sf(qadj, df)
        res = pd.DataFrame([[n, qadj, df, p]])
        res.columns = ["n", "statistic", "df", "p-value"]
    return res
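The no-ties formula from the Notes can be checked against SciPy's `friedmanchisquare`, which reduces to the same statistic when no ties occur within a row. A minimal sketch on made-up toy data:

```python
import numpy as np
from scipy.stats import rankdata, chi2, friedmanchisquare

# toy data: n = 5 cases (rows), k = 3 variables (columns), no ties in any row
data = np.array([
    [1, 2, 3],
    [1, 3, 2],
    [2, 1, 3],
    [1, 2, 3],
    [2, 1, 3],
], dtype=float)
n, k = data.shape

# rank within each case, then sum the ranks per variable (R_j)
ranks = np.apply_along_axis(rankdata, 1, data)
R_j = ranks.sum(axis=0)

# Friedman (1937) statistic and chi-square p-value with df = k - 1
chi2_f = 12 / (n * k * (k + 1)) * (R_j ** 2).sum() - 3 * n * (k + 1)
p = chi2.sf(chi2_f, k - 1)

# with no ties, scipy's tie correction factor is 1, so the statistics agree
stat, p_scipy = friedmanchisquare(*data.T)
print(chi2_f, p)  # 5.2 and its chi-square significance
```

For this data the column rank sums are 7, 9 and 14, giving a statistic of 5.2 on 2 degrees of freedom.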
Functions
def ts_friedman(data, levels=None, ties=True, dist='chi')
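The Iman-Davenport F and Friedman normal approximations from the Notes can be sketched numerically. The inputs below (n = 5, k = 3, and a chi-square statistic of 5.2) are hypothetical values chosen for illustration:

```python
from statistics import NormalDist
from scipy.stats import f

# hypothetical inputs: n cases, k variables, and a Friedman chi-square value
n, k = 5, 3
chi2_f = 5.2

# Iman-Davenport F approximation, df1 = k - 1, df2 = (k - 1)(n - 1)
f_f = (n - 1) * chi2_f / (n * (k - 1) - chi2_f)
p_f = f.sf(f_f, k - 1, (k - 1) * (n - 1))

# Friedman's normal approximation
z = (chi2_f - (k - 1)) / (2 * (n - 1) / n * (k - 1)) ** 0.5
p_z = 2 * (1 - NormalDist().cdf(abs(z)))
print(f_f, p_f, z, p_z)
```

The three distributions give different p-values for the same data; the chi-square form is the conventional default, which is why `dist="chi"` is the function's default.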
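The ANOVA-on-ranks form of the ties correction described in the Notes can be checked against `scipy.stats.friedmanchisquare`, which applies the SPSS-style correction; the two agree. A sketch on made-up data containing ties within rows:

```python
import numpy as np
from scipy.stats import rankdata, friedmanchisquare

# toy data with ties inside some rows (hypothetical values)
data = np.array([
    [1, 1, 2],
    [1, 2, 2],
    [1, 2, 3],
    [2, 1, 3],
    [1, 2, 3],
], dtype=float)
n, k = data.shape

# rank within each case; tied scores receive average ranks
ranks = np.apply_along_axis(rankdata, 1, data)
r_bar = (k + 1) / 2           # overall mean rank
r_bar_j = ranks.mean(axis=0)  # mean rank per variable

# ANOVA-on-ranks form of the tie-corrected Friedman statistic
sst = n * ((r_bar_j - r_bar) ** 2).sum()
sse = ((ranks - r_bar) ** 2).sum() / (n * (k - 1))
q_adj = sst / sse

# scipy divides the uncorrected statistic by the ties correction factor
stat, p = friedmanchisquare(*data.T)
print(q_adj, stat)  # identical statistics
```

Here the uncorrected statistic is 6.7 and the correction factor is 0.9, so both routes give roughly 7.444.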