Module stikpetP.other.poho_conover_iman

Expand source code
import pandas as pd
from scipy.stats import t 
from ..other.table_cross import tab_cross

def ph_conover_iman(catField, ordField, categories=None, levels=None):
    '''
    Post-Hoc Conover-Iman Test
    --------------------------
    This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).
    
    The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing.
    
    Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.
    
    Parameters
    ----------
    catField : pandas series
        data with categories
    ordField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from catField
    levels : list or dictionary, optional
        the levels or order used in ordField.
        
    Returns
    -------
    A dataframe with:
    
    * *cat. 1*, one of the two categories being compared
    * *cat. 2*, second of the two categories being compared
    * *n1*, number of cat. 1 cases in comparison
    * *n2*, number of cat. 2 cases in comparison
    * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
    * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
    * *statistic*, the t-value of the test
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted two-sided p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-value, capped at 1
    
    If there are fewer than two categories there is nothing to compare and an empty dataframe (with the columns above) is returned.
    
    Notes
    -----
    The formula used is (Conover & Iman, 1979, p. 11):
    $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$
    $$df = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df\\right)\\right)$$
    
    With:
    $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$
    $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$
    $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
    
    Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair.
    
    The Bonferroni adjustment multiplies each p-value with the number of comparisons:
    $$sig._{adj} = \\min\\left(sig. \\times \\frac{k\\times\\left(k-1\\right)}{2}, 1\\right)$$
    
    The formula can also be found in Conover (1980, pp. 230-231).
    
    *Symbols used*
    
    * \\(k\\), the number of categories
    * \\(n_i\\), the number of scores in category i
    * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
    * \\(R_i\\), the sum of the ranks in category i
    * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison).
    * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution.
    
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. 
    
    Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    #(the code below reads it as: one row per level of ordField, one column per
    # category of catField, with the totals in the last row and the last column)
    ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include")
    
    #basic counts (the -1 skips the totals row/column)
    k = ct.shape[1] - 1
    nlvl = ct.shape[0] - 1
    n = ct.iloc[nlvl, k]
    
    #the ranks of the levels: every score in a level gets the mid-rank of that level,
    #i.e. the average of the positions cf+1 ... cf+count
    lvlRank = []
    cf = 0
    for i in range(nlvl):
        lvlCount = ct.iloc[i, k]
        lvlRank.append((2 * cf + lvlCount + 1) / 2)
        cf = cf + lvlCount
    
    #sum of ranks per category (srj), overall sum of squared ranks (srj2)
    #and the sum of R_i^2 / n_i needed for T
    srj = []
    srj2 = 0
    T = 0
    for j in range(k):
        sr = 0
        for i in range(nlvl):
            sr = sr + ct.iloc[i, j] * lvlRank[i]
            srj2 = srj2 + ct.iloc[i, j] * lvlRank[i]**2
            
        srj.append(sr)
        T = T + sr**2 / ct.iloc[nlvl, j]
    
    #S^2, T and the part of the variance that is the same for every pair
    ff = n * (n + 1)**2 / 4
    s2 = (srj2 - ff) / (n - 1)
    T = (T - ff) / s2
    pooledVar = s2 * (n - 1 - T) / (n - k)
    df = n - k
    
    #number of pairwise comparisons, used for the Bonferroni adjustment
    ncomp = k * (k - 1) / 2
    
    colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value", "adj. p-value"]
    rows = []
    for i in range(k - 1):
        for j in range(i + 1, k):
            n1 = ct.iloc[nlvl, i]
            n2 = ct.iloc[nlvl, j]
            m1 = srj[i] / n1
            m2 = srj[j] / n2
            
            se = (pooledVar * (1 / n1 + 1 / n2))**0.5
            tVal = (m1 - m2) / se
            
            #two-sided p-value; sf (= 1 - cdf) avoids the loss of precision
            #that 1 - cdf has for large |t|
            p = 2 * t.sf(abs(tVal), df)
            
            #Bonferroni adjustment, capped at 1 since it is a probability
            pAdj = min(p * ncomp, 1)
            
            rows.append([ct.columns[i], ct.columns[j], n1, n2, m1, m2, tVal, df, p, pAdj])
    
    #building the frame in one go keeps the column names also when there are no comparisons
    res = pd.DataFrame(rows, columns=colNames)
    return res

Functions

def ph_conover_iman(catField, ordField, categories=None, levels=None)

Post-Hoc Conover-Iman Test

This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).

The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing.

Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.

Parameters

catField : pandas series
data with categories
ordField : pandas series
data with the scores
categories : list or dictionary, optional
the categories to use from catField
levels : list or dictionary, optional
the levels or order used in ordField.

Returns

A dataframe with:
 
  • cat. 1, one of the two categories being compared
  • cat. 2, second of the two categories being compared
  • n1, number of cat. 1 cases in comparison
  • n2, number of cat. 2 cases in comparison
  • mean rank 1, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
  • mean rank 2, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
  • statistic, the t-value of the test
  • df, the degrees of freedom
  • p-value, the p-value (significance)

Notes

The formula used is (Conover & Iman, 1979, p. 11):
$$t_{1,2} = \frac{\bar{r}_1 - \bar{r}_2}{\sqrt{S^2\times\frac{n-1-T}{n-k}\times\left(\frac{1}{n_1}+\frac{1}{n_2}\right)}}$$
$$df = n - k$$
$$sig. = 2\times\left(1 - T\left(\left|t_{1,2}\right|, df\right)\right)$$

With:
$$S^2=\frac{\sum_{j=1}^k \sum_{i=1}^{n_j} r_{i,j}^2 - \frac{n\times\left(n+1\right)^2}{4}}{n-1}$$
$$T = \frac{\sum_{i=1}^k \frac{R_i^2}{n_i} - \frac{n\times\left(n+1\right)^2}{4}}{S^2}$$
$$R_i = \sum_{j=1}^{n_i} r_{i,j}$$

Note that S^2, T, k, n are all based on all scores, including those not in the selected pair.

The formula can also be found in Conover (1980, pp. 230-231).

Symbols used

  • k, the number of categories
  • n_i, the number of scores in category i
  • r_{i,j}, the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
  • R_i, the sum of the ranks in category i
  • \bar{r}_i, the average of the ranks in category i, using all original scores (incl. those not in the comparison).
  • T\left(\dots\right), the cumulative distribution function of the Student t distribution.

References

Conover, W. J. (1980). Practical nonparametric statistics (2nd ed.). Wiley.

Conover, W. J., & Iman, R. L. (1979). On multiple-comparisons procedures (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code
def ph_conover_iman(catField, ordField, categories=None, levels=None):
    '''
    Post-Hoc Conover-Iman Test
    --------------------------
    This can be used as a post-hoc test for a Kruskal-Wallis test (see ts_kruskal_wallis()).
    
    The test compares each possible pair of categories from the catField and their mean rank. The null hypothesis is that these are then equal. A simple Bonferroni adjustment is also made for the multiple testing.
    
    Other post-hoc tests that could be considered are Dunn, Nemenyi, Steel-Dwass, a pairwise Mann-Whitney U, or pairwise Mood-Median.
    
    Parameters
    ----------
    catField : pandas series
        data with categories
    ordField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from catField
    levels : list or dictionary, optional
        the levels or order used in ordField.
        
    Returns
    -------
    A dataframe with:
    
    * *cat. 1*, one of the two categories being compared
    * *cat. 2*, second of the two categories being compared
    * *n1*, number of cat. 1 cases in comparison
    * *n2*, number of cat. 2 cases in comparison
    * *mean rank 1*, mean rank of cases in cat. 1, based on all cases (incl. categories not in comparison)
    * *mean rank 2*, mean rank of cases in cat. 2, based on all cases (incl. categories not in comparison)
    * *statistic*, the t-value of the test
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted two-sided p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-value, capped at 1
    
    If there are fewer than two categories there is nothing to compare and an empty dataframe (with the columns above) is returned.
    
    Notes
    -----
    The formula used is (Conover & Iman, 1979, p. 11):
    $$t_{1,2} = \\frac{\\bar{r}_1 - \\bar{r}_2}{\\sqrt{S^2\\times\\frac{n-1-T}{n-k}\\times\\left(\\frac{1}{n_1}+\\frac{1}{n_2}\\right)}}$$
    $$df = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df\\right)\\right)$$
    
    With:
    $$S^2=\\frac{\\sum_{j=1}^k \\sum_{i=1}^{n_j} r_{i,j}^2 - \\frac{n\\times\\left(n+1\\right)^2}{4}}{n-1}$$
    $$T = \\frac{\\sum_{i=1}^k \\frac{R_i^2}{n_i} - \\frac{n\\times\\left(n+1\\right)^2}{4}}{S^2}$$
    $$R_i = \\sum_{j=1}^{n_i} r_{i,j}$$
    
    Note that \\(S^2, T, k, n\\) are all based on all scores, including those not in the selected pair.
    
    The Bonferroni adjustment multiplies each p-value with the number of comparisons:
    $$sig._{adj} = \\min\\left(sig. \\times \\frac{k\\times\\left(k-1\\right)}{2}, 1\\right)$$
    
    The formula can also be found in Conover (1980, pp. 230-231).
    
    *Symbols used*
    
    * \\(k\\), the number of categories
    * \\(n_i\\), the number of scores in category i
    * \\(r_{i,j}\\), the rank of the j-th score in category i using all original scores (incl. those not in the comparison).
    * \\(R_i\\), the sum of the ranks in category i
    * \\(\\bar{r}_i\\), the average of the ranks in category i, using all original scores (incl. those not in the comparison).
    * \\(T\\left(\\dots\\right)\\), the cumulative distribution function of the Student t distribution.
    
    References
    ----------
    Conover, W. J. (1980). *Practical nonparametric statistics* (2nd ed.). Wiley. 
    
    Conover, W. J., & Iman, R. L. (1979). *On multiple-comparisons procedures* (LA-7677-MS; pp. 1–14). Los Alamos Scientific Laboratory.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    #(the code below reads it as: one row per level of ordField, one column per
    # category of catField, with the totals in the last row and the last column)
    ct = tab_cross(ordField, catField, order1=levels, order2=categories, totals="include")
    
    #basic counts (the -1 skips the totals row/column)
    k = ct.shape[1] - 1
    nlvl = ct.shape[0] - 1
    n = ct.iloc[nlvl, k]
    
    #the ranks of the levels: every score in a level gets the mid-rank of that level,
    #i.e. the average of the positions cf+1 ... cf+count
    lvlRank = []
    cf = 0
    for i in range(nlvl):
        lvlCount = ct.iloc[i, k]
        lvlRank.append((2 * cf + lvlCount + 1) / 2)
        cf = cf + lvlCount
    
    #sum of ranks per category (srj), overall sum of squared ranks (srj2)
    #and the sum of R_i^2 / n_i needed for T
    srj = []
    srj2 = 0
    T = 0
    for j in range(k):
        sr = 0
        for i in range(nlvl):
            sr = sr + ct.iloc[i, j] * lvlRank[i]
            srj2 = srj2 + ct.iloc[i, j] * lvlRank[i]**2
            
        srj.append(sr)
        T = T + sr**2 / ct.iloc[nlvl, j]
    
    #S^2, T and the part of the variance that is the same for every pair
    ff = n * (n + 1)**2 / 4
    s2 = (srj2 - ff) / (n - 1)
    T = (T - ff) / s2
    pooledVar = s2 * (n - 1 - T) / (n - k)
    df = n - k
    
    #number of pairwise comparisons, used for the Bonferroni adjustment
    ncomp = k * (k - 1) / 2
    
    colNames = ["cat. 1", "cat. 2", "n1", "n2", "mean rank 1", "mean rank 2", "statistic", "df", "p-value", "adj. p-value"]
    rows = []
    for i in range(k - 1):
        for j in range(i + 1, k):
            n1 = ct.iloc[nlvl, i]
            n2 = ct.iloc[nlvl, j]
            m1 = srj[i] / n1
            m2 = srj[j] / n2
            
            se = (pooledVar * (1 / n1 + 1 / n2))**0.5
            tVal = (m1 - m2) / se
            
            #two-sided p-value; sf (= 1 - cdf) avoids the loss of precision
            #that 1 - cdf has for large |t|
            p = 2 * t.sf(abs(tVal), df)
            
            #Bonferroni adjustment, capped at 1 since it is a probability
            pAdj = min(p * ncomp, 1)
            
            rows.append([ct.columns[i], ct.columns[j], n1, n2, m1, m2, tVal, df, p, pAdj])
    
    #building the frame in one go keeps the column names also when there are no comparisons
    res = pd.DataFrame(rows, columns=colNames)
    return res