Module stikpetP.tests.test_kruskal_wallis
import pandas as pd
from scipy.stats import chi2
from scipy.stats import gamma
from scipy.stats import f
from scipy.stats import beta
from ..other.table_cross import tab_cross
def ts_kruskal_wallis(catField, ordField, categories=None, levels=None, method = "chi2", tiescorr = True):
'''
Kruskal-Wallis H Test
---------------------
This test is an extension of the Mann-Whitney U test (see ts_mann_whitney()) to more than two categories. It is also seen as the non-parametric version of the one-way ANOVA (see ts_fisher_owa()).
The test can indicate whether the scores in one or more categories have a significantly different mean rank than one or more of the other categories. More strictly, the null hypothesis is that the probability of a randomly selected case having a score greater than a random score from another category is 50% (Divine et al., p. 286).
As an alternative, there is the Mood Median test (see ts_mood_median()).
To pinpoint which category or categories differ significantly, a post-hoc analysis could be used.
Parameters
----------
catField : pandas series
data with categories
ordField : pandas series
data with the scores
categories : list or dictionary, optional
the categories to use from catField
levels : list or dictionary, optional
the levels or order used in ordField.
method : {"chi2", "kw-gamma", "kw-gamma-chi2", "kw-beta", "kw-beta-f", "wallace-f1", "wallace-f2", "wallace-f3", "wallace-beta1", "wallace-beta2", "wallace-beta3", "ids"}, optional
the specific variation of the test to use. Default is "chi2".
tiescorr : boolean, optional
use of a ties correction. Default is True.
Returns
-------
Dataframe with:
* *n*, the sample size
* *h*, the H-statistic
* other test results, depending on test used.
Notes
-----
**The H value**
The formula used is (Kruskal & Wallis, 1952, p. 586):
$$H = \\frac{12}{n\\times\\left(n + 1\\right)}\\times\\sum_{i=1}^k \\frac{R_i^2}{n_i}-3\\times\\left(n+1\\right)$$
With:
$$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
The ties correction (Kruskal & Wallis, 1952, p. 586):
$$H_{adj} = \\frac{H}{1-\\frac{\\sum_j T_j}{n^3-n}}$$
With:
$$T_j = t_j^3 - t_j$$
Or alternatively:
$$H_{adj} = \\left(n - 1\\right)\\times\\frac{\\sum_{i=1}^k n_i\\left(\\bar{r}_i - \\bar{r}\\right)^2}{\\sum_{i=1}^k \\sum_{j=1}^{n_i}\\left(r_{i,j}-\\bar{r}\\right)^2}$$
With:
$$\\bar{r}_i = \\frac{R_i}{n_i}$$
$$\\bar{r} = \\frac{\\sum_{i=1}^k n_i\\times\\bar{r}_i}{n} = \\frac{n+1}{2}$$
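As a numeric sanity check, H and the ties-corrected H can be computed directly on made-up scores and compared against scipy.stats.kruskal, which applies the same ties correction (a sketch, not part of this module):

```python
# Sketch: compute H and the ties-corrected H_adj by hand for made-up
# scores, then compare with scipy.stats.kruskal (which corrects ties too).
import numpy as np
from scipy.stats import rankdata, kruskal

groups = [np.array([1, 2, 2, 4]), np.array([2, 3, 5]), np.array([1, 4, 4, 6])]
scores = np.concatenate(groups)
n = scores.size
ranks = rankdata(scores)            # mid-ranks for tied scores

# H = 12/(n(n+1)) * sum_i R_i^2/n_i - 3(n+1)
sum_term, start = 0.0, 0
for g in groups:
    R_i = ranks[start:start + g.size].sum()
    sum_term += R_i**2 / g.size
    start += g.size
h = 12 / (n * (n + 1)) * sum_term - 3 * (n + 1)

# ties correction: H_adj = H / (1 - sum_j(t_j^3 - t_j)/(n^3 - n))
_, t = np.unique(scores, return_counts=True)
h_adj = h / (1 - (t**3 - t).sum() / (n**3 - n))

h_scipy, _ = kruskal(*groups)
```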
**The Test**
*"chi2", Kruskal-Wallis Chi-Square Approximation*
$$sig. \\approx 1 - \\chi^2\\left(H, df\\right)$$
$$df = k - 1$$
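For the default method the p-value is simply the survival function of the chi-square distribution; a minimal sketch with a hypothetical H value and k = 3:

```python
# Sketch: the default "chi2" method for a hypothetical H with k = 3.
from scipy.stats import chi2

h, k = 5.6, 3                 # hypothetical H value and number of categories
p = chi2.sf(h, df=k - 1)      # sf(h, df) equals 1 - cdf(h, df)
# for df = 2 the survival function reduces to exp(-h/2)
```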
*"kw-gamma", Kruskal-Wallis incomplete gamma approximation* (Kruskal & Wallis, 1952, p. 609)
$$sig. \\approx 1 - \\gamma\\left(H, \\alpha, \\beta\\right)$$
$$\\alpha = \\frac{\\mu^2}{\\sigma^2}$$
$$\\beta = \\frac{\\sigma^2}{\\mu}$$
$$\\mu = k - 1$$
$$\\sigma^2 = 2\\times\\left(k - 1\\right) - \\frac{2\\times\\left(3\\times k^2 - 6\\times k + n\\times\\left(2\\times k^2-6\\times k+1\\right)\\right)}{5\\times n\\times\\left(n + 1\\right)} - \\frac{6}{5}\\times\\sum_{i=1}^k \\frac{1}{n_i}$$
*"kw-gamma-chi2", Kruskal-Wallis Chi-square approximation of gamma approximation*
$$sig. = 1 - \\chi^2\\left(\\chi_a^2, df\\right)$$
$$\\chi_a^2 = \\frac{2\\times\\mu}{\\sigma^2}\\times H$$
$$df = 2\\times\\frac{\\mu^2}{\\sigma^2}$$
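The two gamma-based variants can be sketched for hypothetical group sizes; since a gamma(α, scale β) variable scaled by 2/β is chi-square with 2α degrees of freedom, the two give the same p-value (all inputs below are illustrative assumptions):

```python
# Sketch: "kw-gamma" and "kw-gamma-chi2" for hypothetical group sizes.
from scipy.stats import chi2, gamma

n_i = [4, 3, 4]                # hypothetical group sizes
k, n = len(n_i), sum(n_i)
h = 5.6                        # hypothetical H value
mu = k - 1
var = (2 * (k - 1)
       - 2 * (3 * k**2 - 6 * k + n * (2 * k**2 - 6 * k + 1)) / (5 * n * (n + 1))
       - 6 / 5 * sum(1 / m for m in n_i))

# gamma approximation: shape alpha = mu^2/var, scale beta = var/mu
p_gamma = gamma.sf(h, mu**2 / var, scale=var / mu)

# chi-square approximation of the gamma approximation
p_chi2 = chi2.sf(2 * mu / var * h, 2 * mu**2 / var)
```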
*"kw-beta", Kruskal-Wallis incomplete Beta distribution approximation* (Kruskal & Wallis, 1952, p. 609)
$$sig. = 1 - \\beta\\left(\\frac{H}{M}, \\alpha, \\beta\\right)$$
$$M = \\frac{n^3 - \\sum_{i=1}^k n_i^3}{n\\times\\left(n + 1\\right)}$$
$$\\alpha = df_1\\times\\frac{1}{2}$$
$$\\beta = df_2\\times\\frac{1}{2}$$
$$df_1 = \\mu\\times\\frac{\\mu\\times\\left(M-\\mu\\right)-\\sigma^2}{\\frac{1}{2}\\times M\\times\\sigma^2}$$
$$df_2 = df_1\\times\\frac{M-\\mu}{\\mu}$$
*"kw-beta-f", F-approximation of the Kruskal-Wallis incomplete Beta distribution approximation* (Kruskal & Wallis, 1952, p. 610)
$$sig. \\approx 1 - F\\left(F_{\\alpha}, df_1, df_2\\right)$$
$$F_{\\alpha} = \\frac{H\\times\\left(M - \\mu\\right)}{\\mu\\times\\left(M - H\\right)}$$
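The beta and F variants can also be sketched side by side for hypothetical group sizes; a Beta(df1/2, df2/2) variable maps onto an F(df1, df2) variable, so the two p-values coincide (all inputs below are illustrative assumptions):

```python
# Sketch: "kw-beta" and its F-approximation "kw-beta-f" for
# hypothetical group sizes and a hypothetical H value.
from scipy.stats import beta, f

n_i = [4, 3, 4]                # hypothetical group sizes
k, n = len(n_i), sum(n_i)
h = 5.6                        # hypothetical H value
mu = k - 1
var = (2 * (k - 1)
       - 2 * (3 * k**2 - 6 * k + n * (2 * k**2 - 6 * k + 1)) / (5 * n * (n + 1))
       - 6 / 5 * sum(1 / m for m in n_i))

M = (n**3 - sum(m**3 for m in n_i)) / (n * (n + 1))
df1 = mu * (mu * (M - mu) - var) / (0.5 * M * var)
df2 = df1 * (M - mu) / mu

p_beta = beta.sf(h / M, df1 / 2, df2 / 2)
p_f = f.sf(h * (M - mu) / (mu * (M - h)), df1, df2)
```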
*Wallace F distribution approximations* (Wallace, 1959, p. 226)
$$sig. = 1 - F\\left(F_2, df_1^i, df_2^i\\right)$$
With:
$$F_2 = \\frac{\\left(n - k\\right)\\times H}{\\left(k - 1\\right)\\times\\left(n - 1 - H\\right)}$$
$$df_1^i = \\left(k - 1\\right)\\times d_i$$
$$df_2^i = \\left(n - k\\right)\\times d_i$$
*Wallace Beta distribution approximations* (Wallace, 1959, p. 226)
$$sig. \\approx 1 - \\beta\\left(B_2, \\alpha, \\beta\\right)$$
$$B_2 = \\frac{H}{n-1}$$
$$\\alpha = df_1\\times\\frac{1}{2}$$
$$\\beta = df_2\\times\\frac{1}{2}$$
*"wallace-f1"* and *"wallace-beta1"*
$$d_i = \\frac{\\left(n - k\\right)\\times\\left(k - 1\\right)-\\sigma^2}{\\frac{1}{2}\\times\\left(n - 1\\right)\\times \\sigma^2}$$
*"wallace-f2"* and *"wallace-beta2"*
$$d_i = 1 - \\frac{6\\times\\left(n + 1\\right)}{5\\times\\left(n - 1\\right)\\times\\left(n+1.2\\right)}$$
*"wallace-f3"* and *"wallace-beta3"*
$$d_i = 1$$
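The three Wallace variants can be sketched together for hypothetical group sizes; for each choice of \(d_i\) the F and Beta versions give the same p-value, because \(B_2 = H/(n-1)\) maps onto \(F_2\) under the Beta-to-F transformation (all inputs below are illustrative assumptions):

```python
# Sketch: Wallace F and Beta approximations for hypothetical group sizes.
from scipy.stats import beta, f

n_i = [4, 3, 4]                # hypothetical group sizes
k, n = len(n_i), sum(n_i)
h = 5.6                        # hypothetical H value
var = (2 * (k - 1)
       - 2 * (3 * k**2 - 6 * k + n * (2 * k**2 - 6 * k + 1)) / (5 * n * (n + 1))
       - 6 / 5 * sum(1 / m for m in n_i))

ds = {
    "1": ((n - k) * (k - 1) - var) / (0.5 * (n - 1) * var),
    "2": 1 - 6 * (n + 1) / (5 * (n - 1) * (n + 1.2)),
    "3": 1,
}
f2 = (n - k) * h / ((k - 1) * (n - 1 - h))
b2 = h / (n - 1)
results = {}
for label, d in ds.items():
    df1, df2 = (k - 1) * d, (n - k) * d
    results[label] = (f.sf(f2, df1, df2), beta.sf(b2, df1 / 2, df2 / 2))
```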
*"ids", Iman-Davenport Satterthwaite approximation* (Iman & Davenport, 1976, p. 1338)
$$sig. = 1 - F\\left(F_2, df_1, df_2\\right)$$
With:
$$df_1 = k - 1$$
$$df_2 = \\frac{\\left(\\sum_{i=1}^k\\left(n_i-1\\right)\\times v_i\\right)^2}{\\sum_{i=1}^k\\frac{\\left(\\left(n_i-1\\right)\\times v_i\\right)^2}{n_i-1}}$$
$$v_i = \\frac{\\sum_{j=1}^{n_i}\\left(r_{i,j}-\\bar{r}_i\\right)^2}{n_i-1}$$
$$\\bar{r}_i = \\frac{\\sum_{j=1}^{n_i} r_{i,j}}{n_i}$$
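The Satterthwaite-style \(df_2\) can be sketched with hypothetical group sizes and within-group rank variances; by the Cauchy-Schwarz inequality it never exceeds \(\sum_i (n_i - 1)\):

```python
# Sketch: the df2 of the "ids" method, using hypothetical group sizes
# n_i and hypothetical within-group rank variances v_i.
n_i = [4, 3, 4]                # hypothetical group sizes
v_i = [3.1, 2.5, 4.0]          # hypothetical within-group rank variances

num = sum((m - 1) * v for m, v in zip(n_i, v_i)) ** 2
den = sum(((m - 1) * v) ** 2 / (m - 1) for m, v in zip(n_i, v_i))
df2 = num / den                # 26.3**2 / 89.33 ≈ 7.74
df1 = len(n_i) - 1
```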
*Symbols used:*
* \\(k\\), the number of categories
* \\(t_j\\), the frequency of the j-th unique rank.
* \\(n\\), the total sample size
* \\(n_i\\), the number of scores in category i
* \\(r_{i,j}\\), the rank of the j-th score in category i
* \\(R_i\\), the sum of the ranks in category i
* \\(\\bar{r}_i\\), the average of the ranks in category i
* \\(\\bar{r}\\), the average of all ranks
* \\(\\chi^2\\left(\\dots\\right)\\), the cumulative distribution function of the chi-square distribution.
* \\(F\\left(\\dots\\right)\\), the cumulative distribution function of the F distribution.
* \\(\\beta\\left(\\dots\\right)\\), the cumulative distribution function of the beta distribution.
References
----------
Iman, R. L., & Davenport, J. M. (1976). New approximations to the exact distribution of the Kruskal-Wallis test statistic. *Communications in Statistics - Theory and Methods, 5*(14), 1335–1348. doi:10.1080/03610927608827446
Kruskal, W. H., & Wallis, W. A. (1952). Use of ranks in one-criterion variance analysis. *Journal of the American Statistical Association, 47*(260), 583–621. doi:10.1080/01621459.1952.10483441
Wallace, D. L. (1959). Simplified beta-approximations to the Kruskal-Wallis H test. *Journal of the American Statistical Association, 54*(285), 225. doi:10.2307/2282148
Author
------
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
'''
# "chi2", "kw-gamma", "kw-gamma-chi2", "kw-beta", "kw-beta-f", "wallace-f1", "wallace-f2", "wallace-f3", "wallace-beta1", "wallace-beta2", "wallace-beta3", "ids"
#Excel results differ if df is non-integer
#create the cross table
ct = tab_cross(ordField, catField, levels, categories, totals="include")
#basic counts
k = ct.shape[1]-1
nlvl = ct.shape[0]-1
n = ct.iloc[nlvl, k]
#the ranks of the levels
lvlRank = pd.DataFrame()
cf = 0
for i in range(0, nlvl):
lvlRank.at[i,0] = (2 * cf + ct.iloc[i, k] + 1) / 2
cf = cf + ct.iloc[i, k]
#sum of ranks per category
srj = pd.DataFrame()
n = 0
for j in range(0, k):
sr = 0
for i in range(0, nlvl):
sr = sr + ct.iloc[i, j] * lvlRank.iloc[i,0]
srj.at[j,0] = sr
n = n + ct.iloc[nlvl, j]
#H
h = 0
for i in range(0, k):
h = h + srj.iloc[i,0]**2 / ct.iloc[nlvl, i]
h = 12 / (n * (n + 1)) * h - 3 * (n + 1)
if tiescorr:
#ties
t = 0
for i in range(0, nlvl):
tf = ct.iloc[i, k]
t = t + tf**3 - tf
#Hadj
h = h / (1 - t / (n**3 - n))
Var = 0
for i in range(0, k):
Var = Var + 1 / ct.iloc[nlvl, i]
Var = 2 * (k - 1) - (2 * (3 * k**2 - 6 * k + n * (2 * k**2 - 6 * k + 1))) / (5 * n * (n + 1)) - 6 / 5 * Var
mu = k - 1
res = pd.DataFrame()
if method=="chi2":
#kw chi2 appr
df = mu
p = chi2.sf(h, df)
colnames = ["n", "h", "df", "p-value"]
res = pd.DataFrame([[n, h, df, p]], columns=colnames)
elif method == "kw-gamma":
#kw gamma appr
bet = Var / mu
alp = mu**2 / Var
p = gamma.sf(h, alp, scale=bet)
colnames = ["n", "h", "alpha", "beta", "p-value"]
res = pd.DataFrame([[n, h, alp, bet, p]], columns=colnames)
elif method == "kw-gamma-chi2":
#kw gamma chi appr
chi2Val = 2 * mu / Var * h
df = 2 * mu**2 / Var
p = chi2.sf(chi2Val, df)
colnames = ["n", "h", "chi2 statistic", "df", "p-value"]
res = pd.DataFrame([[n, h, chi2Val, df, p]], columns=colnames)
elif method == "kw-beta" or method == "kw-beta-f" :
#kw beta appr and kw beta F appr
m = 0
for i in range(0, k):
m = m + ct.iloc[nlvl, i]**3
m = (n**3 - m) / (n * (n + 1))
df1 = mu * (mu * (m - mu) - Var) / (0.5 * m * Var)
df2 = df1 * (m - mu) / mu
if method == "kw-beta":
#kw beta appr
alp = df1 * 0.5
bet = df2 * 0.5
p = beta.sf(h / m, alp, bet)
colnames = ["n", "h", "B2", "alpha", "beta", "p-value"]
res = pd.DataFrame([[n, h, h/m, alp, bet, p]], columns=colnames)
else:
#kw beta F appr
fVal = h * (m - mu) / (mu * (m - h))
p = f.sf(fVal, df1, df2)
colnames = ["n", "h", "F statistic", "df1", "df2", "p-value"]
res = pd.DataFrame([[n, h, fVal, df1, df2, p]], columns=colnames)
else:
if method == "wallace-beta1" or method == "wallace-f1":
d = ((n - k) * (k - 1) - Var) / (0.5 * (n - 1)*Var)
elif method == "wallace-beta2" or method == "wallace-f2":
d = 1 - 6 * (n + 1) / (5 * (n - 1) * (n + 1.2))
elif method == "wallace-beta3" or method == "wallace-f3":
d = 1
if method in ["wallace-f1", "wallace-f2", "wallace-f3", "wallace-beta1", "wallace-beta2", "wallace-beta3"]:
df1 = (k - 1) * d
df2 = (n - k) * d
if method in ["wallace-f1", "wallace-f2", "wallace-f3"]:
fw = (n - k) * h / ((k - 1) * (n - 1 - h))
p = f.sf(fw, df1, df2)
colnames = ["n", "h", "F statistic", "df1", "df2", "p-value"]
res = pd.DataFrame([[n, h, fw, df1, df2, p]], columns=colnames)
elif method in ["wallace-beta1", "wallace-beta2", "wallace-beta3"]:
b = h / (n - 1)
alp = df1 * 0.5
bet = df2 * 0.5
p = beta.sf(b, alp, bet)
colnames = ["n", "h", "B2", "alpha", "beta", "p-value"]
res = pd.DataFrame([[n, h, b, alp, bet, p]], columns=colnames)
elif method == "ids":
fw = (n - k) * h / ((k - 1) * (n - 1 - h))
rm = pd.DataFrame()
for j in range(0, k):
rm.at[j,0] = srj.iloc[j,0] / ct.iloc[nlvl, j]
v = pd.DataFrame()
for j in range(0, k):
vval = 0
for i in range(0, nlvl):
#weight each level's squared deviation by its frequency in category j
vval = vval + ct.iloc[i, j] * (lvlRank.iloc[i,0] - rm.iloc[j,0])**2
#sample variance v_i uses n_i - 1
vval = vval / (ct.iloc[nlvl, j] - 1)
v.at[j,0] = vval
num = 0
den = 0
for j in range(0, k):
num = num + (ct.iloc[nlvl, j] - 1) * v.iloc[j,0]
den = den + ((ct.iloc[nlvl, j] - 1) * v.iloc[j,0])**2 / (ct.iloc[nlvl, j] - 1)
df2 = num**2 / den
df1 = k - 1
p = f.sf(fw, df1, df2)
colnames = ["n", "h", "F statistic", "df1", "df2", "p-value"]
res = pd.DataFrame([[n, h, fw, df1, df2, p]], columns=colnames)
return res