Module `stikpetP.other.poho_conover_iman`

Expand source code

import pandas as pd
from scipy.stats import t 
from ..other.table_cross import tab_cross

def ph_conover_iman(catField, ordField, categories=None, levels=None):
    '''
    Post-Hoc Conover-Iman Test
    --------------------------
    This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).
    
    The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment for the multiple testing is reported next to the unadjusted p-value.
    
    Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.
    
    Parameters
    ----------
    catField : pandas series
        data with categories
    ordField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from catField
    levels : list or dictionary, optional
        the levels or order used in ordField.
        
    Returns
    -------
    A dataframe with one row per pair of categories and the columns:
    
    * *cat. 1*, one of the two categories being compared
    * *cat. 2*, second of the two categories being compared
    * *n1*, number of cat. 1 cases in comparison
    * *n2*, number of cat. 2 cases in comparison
    * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
    * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
    * *statistic*, the t-value of the test
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted two-sided p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-value (p-value multiplied by the number of comparisons, capped at 1)
    
    If the cross table holds fewer than two categories there is nothing to compare and an empty dataframe with these columns is returned.
    
    Notes
    -----
    The formula used is (Conover & Iman, 1979, p. 11):
    $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$
    $$df = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df\\right)\\right)$$
    $$sig._{adj} = \\min\\left(sig.\\times\\frac{k\\times\\left(k-1\\right)}{2}, 1\\right)$$
    
    With:
    $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$
    $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$
    $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
    
    Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair.
    
    The formula can also be found in Conover (1980, pp. 230-231).
    
    *Symbols used*
    
    * \\(k\\), the number of categories
    * \\(n_i\\), the number of scores in category i
    * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
    * \\(R_i\\), the sum of the ranks in category i
    * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison).
    * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution.
    
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. 
    
    Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value", "adj. p-value"]
    
    #create the cross table
    #the indexing below treats the last row and last column as the totals
    #(requested with totals="include"), rows as levels and columns as categories
    ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include")
    
    #basic counts
    k = ct.shape[1] - 1
    nlvl = ct.shape[0] - 1
    n = ct.iloc[nlvl, k]
    
    #the ranks of the levels: every case in a level gets the mid-rank of that level
    lvlRank = []
    cf = 0
    for i in range(nlvl):
        lvlFreq = ct.iloc[i, k]
        lvlRank.append((2 * cf + lvlFreq + 1) / 2)
        cf = cf + lvlFreq
    
    #sum of ranks per category, the overall sum of squared ranks,
    #and the sum of R_i^2 / n_i needed for T
    srj = []
    srj2 = 0
    T = 0
    for j in range(k):
        sr = 0
        for i in range(nlvl):
            sr = sr + ct.iloc[i, j] * lvlRank[i]
            srj2 = srj2 + ct.iloc[i, j] * lvlRank[i]**2
        
        srj.append(sr)
        T = T + sr**2 / ct.iloc[nlvl, j]
    
    #NOTE(review): an empty category, n = 1, n = k or all scores tied makes one
    #of the denominators below zero; this is not handled here and is left to the
    #numeric types returned by tab_cross.
    ff = n * (n + 1)**2 / 4
    s2 = (srj2 - ff) / (n - 1)
    T = (T - ff) / s2
    #part of the variance that is the same for every pair
    pooled = s2 * (n - 1 - T) / (n - k)
    df = n - k
    
    #number of pairwise comparisons, used for the Bonferroni adjustment
    ncomp = k * (k - 1) / 2
    
    rows = []
    for i in range(k - 1):
        for j in range(i + 1, k):
            n1 = ct.iloc[nlvl, i]
            n2 = ct.iloc[nlvl, j]
            m1 = srj[i] / n1
            m2 = srj[j] / n2
            
            se = (pooled * (1 / n1 + 1 / n2))**0.5
            tVal = (m1 - m2) / se
            
            #two-sided p-value; the survival function keeps precision in the
            #far tail where 1 - cdf would round to zero
            p = 2 * t.sf(abs(tVal), df)
            
            #Bonferroni adjustment, a probability cannot exceed 1
            pAdj = p * ncomp
            if pAdj > 1:
                pAdj = 1.0
            
            rows.append([ct.columns[i], ct.columns[j], n1, n2, m1, m2, tVal, df, p, pAdj])
    
    #building from a list of rows also yields a properly labelled (empty)
    #dataframe when there are no pairs to compare
    res = pd.DataFrame(rows, columns=colNames)
    return res

Functions

def ph_conover_iman(catField, ordField, categories=None, levels=None)

Post-Hoc Conover-Iman Test

This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).

The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing.

Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.

Parameters

catField : pandas series: data with categories
ordField : pandas series: data with the scores
categories : list or dictionary, optional: the categories to use from catField
levels : list or dictionary, optional: the levels or order used in ordField.

Returns

A dataframe with:

cat. 1, one of the two categories being compared
cat. 2, second of the two categories being compared
n1, number of cat. 1 cases in comparison
n2, number of cat. 2 cases in comparison
mean rank 1, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
mean rank 2, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
statistic, the t-value of the test
df, the degrees of freedom
p-value, the p-value (significance)

Notes

The formula used is (Conover & Iman, 1979, p. 11): $t_{1,2} = \frac{\bar{r}_1 - \bar{r}_2}{\sqrt{S^2\times\frac{n-1-T}{n-k}\times\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}}$ $df = n - k$ $sig. = 2\times\left(1 - T\left(\left|t_{1,2}\right|, df\right)\right)$

With: $S^2=\frac{\sum_{j=1}^k \sum_{i=1}^{n_j} r_{i,j}^2 - \frac{n\times\left(n+1\right)^2}{4}}{n-1}$ $T = \frac{\sum_{i=1}^k \frac{R_i^2}{n_i} - \frac{n\times\left(n+1\right)^2}{4}}{S^2}$ $R_i = \sum_{j=1}^{n_i} r_{i,j}$

Note that $S^2, T, k, n$ are all based on all scores, including those not in the selected pair.

The formula can also be found in Conover (1980, pp. 230-231).

Symbols used

$k$ , the number of categories
$n_i$ , the number of scores in category i
$r_{i,j}$ , the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
$R_i$ , the sum of the ranks in category i
$\bar{r}_i$ , the average of the ranks in category i, using all original scores (incl. those not in the comparison).
$T\left(\dots\right)$ , the cumulative distribution function of the Student t distribution.

References

Conover, W. J. (1980). Practical nonparametric statistics (2nd ed.). Wiley.

Conover, W. J., & Iman, R. L. (1979). On multiple-comparisons procedures (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def ph_conover_iman(catField, ordField, categories=None, levels=None):
    '''
    Post-Hoc Conover-Iman Test
    --------------------------
    This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).
    
    The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment for the multiple testing is reported next to the unadjusted p-value.
    
    Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.
    
    Parameters
    ----------
    catField : pandas series
        data with categories
    ordField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from catField
    levels : list or dictionary, optional
        the levels or order used in ordField.
        
    Returns
    -------
    A dataframe with one row per pair of categories and the columns:
    
    * *cat. 1*, one of the two categories being compared
    * *cat. 2*, second of the two categories being compared
    * *n1*, number of cat. 1 cases in comparison
    * *n2*, number of cat. 2 cases in comparison
    * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
    * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
    * *statistic*, the t-value of the test
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted two-sided p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-value (p-value multiplied by the number of comparisons, capped at 1)
    
    If the cross table holds fewer than two categories there is nothing to compare and an empty dataframe with these columns is returned.
    
    Notes
    -----
    The formula used is (Conover & Iman, 1979, p. 11):
    $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$
    $$df = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df\\right)\\right)$$
    $$sig._{adj} = \\min\\left(sig.\\times\\frac{k\\times\\left(k-1\\right)}{2}, 1\\right)$$
    
    With:
    $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$
    $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$
    $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
    
    Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair.
    
    The formula can also be found in Conover (1980, pp. 230-231).
    
    *Symbols used*
    
    * \\(k\\), the number of categories
    * \\(n_i\\), the number of scores in category i
    * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
    * \\(R_i\\), the sum of the ranks in category i
    * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison).
    * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution.
    
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. 
    
    Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value", "adj. p-value"]
    
    #create the cross table
    #the indexing below treats the last row and last column as the totals
    #(requested with totals="include"), rows as levels and columns as categories
    ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include")
    
    #basic counts
    k = ct.shape[1] - 1
    nlvl = ct.shape[0] - 1
    n = ct.iloc[nlvl, k]
    
    #the ranks of the levels: every case in a level gets the mid-rank of that level
    lvlRank = []
    cf = 0
    for i in range(nlvl):
        lvlFreq = ct.iloc[i, k]
        lvlRank.append((2 * cf + lvlFreq + 1) / 2)
        cf = cf + lvlFreq
    
    #sum of ranks per category, the overall sum of squared ranks,
    #and the sum of R_i^2 / n_i needed for T
    srj = []
    srj2 = 0
    T = 0
    for j in range(k):
        sr = 0
        for i in range(nlvl):
            sr = sr + ct.iloc[i, j] * lvlRank[i]
            srj2 = srj2 + ct.iloc[i, j] * lvlRank[i]**2
        
        srj.append(sr)
        T = T + sr**2 / ct.iloc[nlvl, j]
    
    #NOTE(review): an empty category, n = 1, n = k or all scores tied makes one
    #of the denominators below zero; this is not handled here and is left to the
    #numeric types returned by tab_cross.
    ff = n * (n + 1)**2 / 4
    s2 = (srj2 - ff) / (n - 1)
    T = (T - ff) / s2
    #part of the variance that is the same for every pair
    pooled = s2 * (n - 1 - T) / (n - k)
    df = n - k
    
    #number of pairwise comparisons, used for the Bonferroni adjustment
    ncomp = k * (k - 1) / 2
    
    rows = []
    for i in range(k - 1):
        for j in range(i + 1, k):
            n1 = ct.iloc[nlvl, i]
            n2 = ct.iloc[nlvl, j]
            m1 = srj[i] / n1
            m2 = srj[j] / n2
            
            se = (pooled * (1 / n1 + 1 / n2))**0.5
            tVal = (m1 - m2) / se
            
            #two-sided p-value; the survival function keeps precision in the
            #far tail where 1 - cdf would round to zero
            p = 2 * t.sf(abs(tVal), df)
            
            #Bonferroni adjustment, a probability cannot exceed 1
            pAdj = p * ncomp
            if pAdj > 1:
                pAdj = 1.0
            
            rows.append([ct.columns[i], ct.columns[j], n1, n2, m1, m2, tVal, df, p, pAdj])
    
    #building from a list of rows also yields a properly labelled (empty)
    #dataframe when there are no pairs to compare
    res = pd.DataFrame(rows, columns=colNames)
    return res