Module stikpetP.other.poho_conover_iman
Expand source code
import pandas as pd
from scipy.stats import t 
from ..other.table_cross import tab_cross
def ph_conover_iman(catField, ordField, categories=None, levels=None):
    '''
    Post-Hoc Conover-Iman Test
    --------------------------
    This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).
    
    The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing, and reported next to the unadjusted p-value.
    
    Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.
    
    Parameters
    ----------
    catField : pandas series
        data with categories
    ordField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from catField
    levels : list or dictionary, optional
        the levels or order used in ordField.
        
    Returns
    -------
    A dataframe with one row per pair of categories (empty if there are fewer than two categories) and the columns:
    
    * *cat. 1*, one of the two categories being compared
    * *cat. 2*, second of the two categories being compared
    * *n1*, number of cat. 1 cases in comparison
    * *n2*, number of cat. 2 cases in comparison
    * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
    * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
    * *statistic*, the t-value of the test
    * *df*, the degrees of freedom
    * *p-value*, the two-sided p-value (significance), unadjusted
    * *adj. p-value*, the Bonferroni adjusted p-value, capped at 1
    
    Notes
    -----
    The formula used is (Conover & Iman, 1979, p. 11):
    $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$
    $$df = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df\\right)\\right)$$
    
    With:
    $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$
    $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$
    $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
    
    Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair.
    
    The Bonferroni adjustment multiplies each p-value with the number of comparisons:
    $$sig._{adj} = \\min\\left(sig.\\times\\frac{k\\times\\left(k-1\\right)}{2}, 1\\right)$$
    
    The formula can also be found in Conover (1980, pp. 230-231).
    
    *Symbols used*
    
    * \\(k\\), the number of categories
    * \\(n_i\\), the number of scores in category i
    * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
    * \\(R_i\\), the sum of the ranks in category i
    * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison).
    * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution.
    
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. 
    
    Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table (levels as rows, categories as columns, totals in the last row/column)
    ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include")
    
    #basic counts, the last row and column hold the totals
    k = ct.shape[1] - 1
    nlvl = ct.shape[0] - 1
    n = ct.iloc[nlvl, k]
    
    #the ranks of the levels: all cases in one level are tied and share the mid-rank
    lvlRank = []
    cf = 0
    for i in range(nlvl):
        lvlCount = ct.iloc[i, k]
        lvlRank.append((2 * cf + lvlCount + 1) / 2)
        cf = cf + lvlCount
    
    #sum of ranks per category, and the sum of squared ranks over all cases
    srj = []
    srj2 = 0
    T = 0
    for j in range(k):
        sr = 0
        for i in range(nlvl):
            sr = sr + ct.iloc[i, j] * lvlRank[i]
            srj2 = srj2 + ct.iloc[i, j] * lvlRank[i]**2
            
        srj.append(sr)
        T = T + sr**2 / ct.iloc[nlvl, j]
    
    #S^2 and T as in the docstring, both based on all cases
    ff = n * (n + 1)**2 / 4
    s2 = (srj2 - ff) / (n - 1)
    T = (T - ff) / s2
    #the part of the variance that is the same for every pair
    pooledVar = s2 * (n - 1 - T) / (n - k)
    df = n - k
    
    #number of pairwise comparisons, used for the Bonferroni adjustment
    ncomp = k * (k - 1) / 2
    rows = []
    for i in range(k - 1):
        for j in range(i + 1, k):
            n1 = ct.iloc[nlvl, i]
            n2 = ct.iloc[nlvl, j]
            m1 = srj[i] / n1
            m2 = srj[j] / n2
            
            se = (pooledVar * (1 / n1 + 1 / n2))**0.5
            tVal = (m1 - m2) / se
            #two-sided; sf avoids the underflow to 0 that 1 - cdf gives for large |t|
            p = 2 * t.sf(abs(tVal), df)
            #Bonferroni, a probability cannot exceed 1
            pAdj = min(p * ncomp, 1)
            
            rows.append([ct.columns[i], ct.columns[j], n1, n2, m1, m2, tVal, df, p, pAdj])
    
    colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value", "adj. p-value"]
    #explicit columns keep the frame well-formed when there are no pairs to compare
    return pd.DataFrame(rows, columns=colNames)
- def ph_conover_iman(catField, ordField, categories=None, levels=None)
- 
Post-Hoc Conover-Iman Test. This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()). The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing. Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median. Parameters: - catField: pandas series
- data with categories
- ordField:- pandas series
- data with the scores
- categories: list or dictionary, optional
- the categories to use from catField
- levels: list or dictionary, optional
- the levels or order used in ordField.
 Returns: A dataframe with:
 - cat. 1, one of the two categories being compared
- cat. 2, second of the two categories being compared
- n1, number of cat. 1. cases in comparison
- n2, number of cat. 2 cases in comparison
- mean rank 1, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
- mean rank 2, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
- statistic, the t-value of the test
- df, the degrees of freedom
- p-value, the p-value (significance)
 Notes: The formula used is (Conover & Iman, 1979, p. 11): $$t_{1,2} = \frac{\bar{r}_1 - \bar{r}_2}{\sqrt{S^2\times\frac{n-1-T}{n-k}\times\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}}$$ $$df = n - k$$ $$sig. = 1 - T\left(\left|t_{1,2}\right|, df\right)$$ With: $$S^2=\frac{\sum_{j=1}^k \sum_{i=1}^{n_j} r_{i,j}^2 - \frac{n\times\left(n+1\right)^2}{4}}{n-1}$$ $$T = \frac{\sum_{i=1}^k \frac{R_i^2}{n_i} - \frac{n\times\left(n+1\right)^2}{4}}{S^2}$$ $$R_i = \sum_{j=1}^{n_i} r_{i,j}$$ Note that \(S^2, T, k, n\) are all based on all scores, including those not in the selected pair. The formula can also be found in Conover (1980, pp. 230-231). Symbols used: - \(k\), the number of categories
- n_i, the number of scores in category i
- r_{i,j}, the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
- R_i, the sum of the ranks in category i
- \bar{r}_i, the average of the ranks in category i, using all original scores (incl. those not in the comparison).
- T\left(\dots\right), the cumulative distribution function of the Student t distribution.
 References: Conover, W. J. (1980). Practical nonparametric statistics (2nd ed.). Wiley. Conover, W. J., & Iman, R. L. (1979). On multiple-comparisons procedures (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory. Author: Made by P. Stikker. Companion website: https://PeterStatistics.com 
 YouTube channel: https://www.youtube.com/stikpet
 Donations: https://www.patreon.com/bePatron?u=19398076Expand source codedef ph_conover_iman(catField, ordField, categories=None, levels=None): ''' Post-Hoc Conover-Iman Test -------------------------- This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()). The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing. Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median. Parameters ---------- catField : pandas series data with categories ordField : pandas series data with the scores categories : list or dictionary, optional the categories to use from catField levels : list or dictionary, optional the levels or order used in ordField. Returns ------- A dataframe with: * *cat. 1*, one of the two categories being compared * *cat. 2*, second of the two categories being compared * *n1*, number of cat. 1. cases in comparison * *n2*, number of cat. 2 cases in comparison * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison) * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison) * *statistic*, the t-value of the test * *df*, the degrees of freedom * *p-value*, the p-value (significance) Notes ----- The formula used is (Conover & Iman, 1979, p. 11): $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$ $$df = n - k$$ $$sig. 
= 1 - T\\left(\\left|t_{1,2}\\right|, df\\right)$$ With: $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$ $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$ $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$ Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair. The formula can also be found in Conover (1980, pp. 230-231). *Symbols used* * \\(k\\), the number of categories * \\(n_i\\), the number of scores in category i * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison). * \\(R_i\\), the sum of the ranks in category i * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison). * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution. References ---------- Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory. Author ------ Made by P. 
Stikker Companion website: https://PeterStatistics.com YouTube channel: https://www.youtube.com/stikpet Donations: https://www.patreon.com/bePatron?u=19398076 ''' #create the cross table ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include") #basic counts k = ct.shape[1]-1 nlvl = ct.shape[0]-1 n = ct.iloc[nlvl, k] #the ranks of the levels lvlRank = pd.Series(dtype="object") cf = 0 for i in range(0,nlvl): lvlRank.at[i] = (2 * cf + ct.iloc[i, k] + 1) / 2 cf = cf + ct.iloc[i, k] #sum of ranks per category srj = pd.Series(dtype="object") srj2=0 T = 0 for j in range(0,k): sr = 0 for i in range(0,nlvl): sr = sr + ct.iloc[i, j] * lvlRank.iloc[i] srj2 = srj2 + ct.iloc[i, j] * lvlRank.iloc[i]**2 srj.at[j] = sr T = T + sr**2 / ct.iloc[nlvl,j] ff = n * (n + 1)**2 / 4 s2 = (srj2 - ff) / (n - 1) T = (T - ff) / s2 ff = s2 * (n - 1 - T) / (n - k) df = n - k ncomp = k * (k - 1) / 2 res = pd.DataFrame() resRow = 0 for i in range(0, k-1): for j in range(i+1, k): n1 = ct.iloc[nlvl, i] n2 = ct.iloc[nlvl, j] m1 = srj.iloc[i] / n1 m2 = srj.iloc[j] / n2 d = m1 - m2 Var = ff * (1 / n1 + 1 / n2) se = (Var)**0.5 tVal = d / se p = 2*(1-t.cdf(abs(tVal), df)) res.at[resRow, 0] = ct.columns[i] res.at[resRow, 1] = ct.columns[j] res.at[resRow, 2] = n1 res.at[resRow, 3] = n2 res.at[resRow, 4] = m1 res.at[resRow, 5] = m2 res.at[resRow, 6] = tVal res.at[resRow, 7] = df res.at[resRow, 8] = p if res.iloc[resRow, 8] > 1: res.at[resRow, 8] = 1 resRow = resRow + 1 colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value"] res.columns = colNames return res