Module stikpetP.other.poho_residual

Expand source code
import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross

def ph_residual(field1, field2, categories1=None, categories2=None, residual="adjusted"):
    '''
    Post-Hoc Residual Test
    ----------------------
    
    Performs a z-test based on the residuals for each cell in a cross table.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the two categories to use from field1. If not set the first two found will be used
    categories2 : list or dictionary, optional
        the two categories to use from field2. If not set the first two found will be used
    residual : {"adjusted", "standardized"}, optional
        which residual to use, either adjusted standardized, or standardized.

    Returns
    -------
    A dataframe with:
    * *field1* and *field2*, the two categories compared
    * *observed*, the observed count in the cell
    * *expected*, the expected count from the cell
    * *residual*, the difference between observed and expected
    * *resUsed*, the residual used    
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value
    
    Notes
    -----    
    The formula used for the standardized residuals is (Haberman, 1973, p. 205):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}}}$$
    
    This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).
    
    The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}\\times\\left(1 - \\frac{R_i}{n}\\right)\\times\\left(1 - \\frac{C_j}{n}\\right)}}$$
    
    With:
    $$E_{i,j} = \\frac{R_{i}\\times C_{j}}{n}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    $$R_i = \\sum_{j=1}^c F_{i,j}$$
    $$n = \\sum_{i=1}^r \\sum_{j=1}^c F_{i,j} = \\sum_{j=1}^c C_{j} = \\sum_{i=1}^r R_i$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(z_{i,j}\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times r \\times c, 1\\right)$$
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(E_{i,j}\\) the expected count of cell in row i, column j
    * \\(R_i\\) the row total of row i
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(n\\) the grand total
    * \\(\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. *Biometrics, 29*(1), 205–220. doi:10.2307/2529686
    
    Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, *Research & Evaluation, 20*(8), 1–10.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
  
    #basic counts
    nrows = ct.shape[0] - 1
    ncols =  ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    
    res = pd.DataFrame()
    
    ipair=0
    for i in range(0, nrows):
        for j in range(0, ncols):
            res.loc[ipair, 0] = list(ct.index)[i]
            res.loc[ipair, 1] = list(ct.columns)[j]
            res.loc[ipair, 2] = ct.iloc[i, j]
            
            #expected count
            res.loc[ipair, 3] = ct.iloc[nrows, j] * ct.iloc[i, ncols] / n
            
            #residual
            res.loc[ipair, 4] = res.iloc[ipair, 2] - res.iloc[ipair, 3]
            
            #adjusted
            if (residual == "standardized"):
                se = res.iloc[ipair, 3]**0.5
            elif (residual == "adjusted"):
                se = (res.iloc[ipair, 3] * (1 - ct.iloc[i, ncols] / n) * (1 - ct.iloc[nrows, j] / n))**0.5
            
            res.loc[ipair, 5] = res.iloc[ipair, 4] / se
            
            #p-value
            res.loc[ipair, 6] = 2 * (1 - NormalDist().cdf(abs(res.iloc[ipair, 5])))
            
            #adj
            res.loc[ipair, 7] = res.iloc[ipair, 6] * nrows * ncols
            if (res.iloc[ipair, 7] > 1):
                res.iloc[ipair, 7] = 1
            ipair = ipair + 1
            
    
    #column names
    if (residual == "standardized"):
        resUsed = "st. residual"
    elif (residual == "adjusted"):
        resUsed = "adj. st. residual"        
    colNames = ["field1", "field2", "observed", "expected", "residual", resUsed, "p-value", "adj. p-value"]
    res.columns = colNames
    return (res)

Functions

def ph_residual(field1, field2, categories1=None, categories2=None, residual='adjusted')

Post-Hoc Residual Test

Performs a z-test based on the residuals for each cell in a cross table.

Parameters

field1 : pandas series
data with categories for the rows
field2 : pandas series
data with categories for the columns
categories1 : list or dictionary, optional
the two categories to use from field1. If not set the first two found will be used
categories2 : list or dictionary, optional
the two categories to use from field2. If not set the first two found will be used
residual : {"adjusted", "standardized"}, optional
which residual to use, either adjusted standardized, or standardized.

Returns

A dataframe with:
 
  • field1 and field2, the two categories compared
  • observed, the observed count in the cell
  • expected, the expected count from the cell
  • residual, the difference between observed and expected
  • resUsed, the residual used
  • p-value, the significance (p-value)
  • adj. p-value, the Bonferroni adjusted p-value

Notes

The formula used for the standardized residuals is (Haberman, 1973, p. 205): z_{i,j} = \frac{F_{i,j} - E_{i,j}}{\sqrt{E_{i,j}}}

This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).

The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207): z_{i,j} = \frac{F_{i,j} - E_{i,j}}{\sqrt{E_{i,j}\times\left(1 - \frac{R_i}{n}\right)\times\left(1 - \frac{C_j}{n}\right)}}

With: E_{i,j} = \frac{R_{i}\times C_{j}}{n} C_j = \sum_{i=1}^r F_{i,j} R_i = \sum_{j=1}^c F_{i,j} n = \sum_{i=1}^r \sum_{j=1}^c F_{i,j} = \sum_{j=1}^c C_{j} = \sum_{i=1}^r R_i

The p-value (sig.) is then determined using: sig. = 2\times\left(1 - \Phi\left(z_{i,j}\right)\right)

A simple Bonferroni correction is applied for the multiple comparisons. This is simply: sig._{adj} = \min \left(sig. \times r \times c, 1\right)

Symbols used:

  • F_{i,j} the observed count of cell in row i, column j
  • E_{i,j} the expected count of cell in row i, column j
  • R_i the row total of row i
  • C_j the column total of column j
  • r the number of rows
  • c the number of columns
  • n the grand total
  • \Phi\left(...\right) the cumulative distribution function of the standard normal distribution.

References

Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. Biometrics, 29(1), 205–220. doi:10.2307/2529686

Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, Research & Evaluation, 20(8), 1–10.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code
def ph_residual(field1, field2, categories1=None, categories2=None, residual="adjusted"):
    '''
    Post-Hoc Residual Test
    ----------------------
    
    Performs a z-test based on the residuals for each cell in a cross table.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the two categories to use from field1. If not set the first two found will be used
    categories2 : list or dictionary, optional
        the two categories to use from field2. If not set the first two found will be used
    residual : {"adjusted", "standardized"}, optional
        which residual to use, either adjusted standardized, or standardized.

    Returns
    -------
    A dataframe with:
    * *field1* and *field2*, the two categories compared
    * *observed*, the observed count in the cell
    * *expected*, the expected count from the cell
    * *residual*, the difference between observed and expected
    * *resUsed*, the residual used    
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value
    
    Notes
    -----    
    The formula used for the standardized residuals is (Haberman, 1973, p. 205):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}}}$$
    
    This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).
    
    The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}\\times\\left(1 - \\frac{R_i}{n}\\right)\\times\\left(1 - \\frac{C_j}{n}\\right)}}$$
    
    With:
    $$E_{i,j} = \\frac{R_{i}\\times C_{j}}{n}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    $$R_i = \\sum_{j=1}^c F_{i,j}$$
    $$n = \\sum_{i=1}^r \\sum_{j=1}^c F_{i,j} = \\sum_{j=1}^c C_{j} = \\sum_{i=1}^r R_i$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(z_{i,j}\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times r \\times c, 1\\right)$$
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(E_{i,j}\\) the expected count of cell in row i, column j
    * \\(R_i\\) the row total of row i
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(n\\) the grand total
    * \\(\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. *Biometrics, 29*(1), 205–220. doi:10.2307/2529686
    
    Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, *Research & Evaluation, 20*(8), 1–10.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
  
    #basic counts
    nrows = ct.shape[0] - 1
    ncols =  ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    
    res = pd.DataFrame()
    
    ipair=0
    for i in range(0, nrows):
        for j in range(0, ncols):
            res.loc[ipair, 0] = list(ct.index)[i]
            res.loc[ipair, 1] = list(ct.columns)[j]
            res.loc[ipair, 2] = ct.iloc[i, j]
            
            #expected count
            res.loc[ipair, 3] = ct.iloc[nrows, j] * ct.iloc[i, ncols] / n
            
            #residual
            res.loc[ipair, 4] = res.iloc[ipair, 2] - res.iloc[ipair, 3]
            
            #adjusted
            if (residual == "standardized"):
                se = res.iloc[ipair, 3]**0.5
            elif (residual == "adjusted"):
                se = (res.iloc[ipair, 3] * (1 - ct.iloc[i, ncols] / n) * (1 - ct.iloc[nrows, j] / n))**0.5
            
            res.loc[ipair, 5] = res.iloc[ipair, 4] / se
            
            #p-value
            res.loc[ipair, 6] = 2 * (1 - NormalDist().cdf(abs(res.iloc[ipair, 5])))
            
            #adj
            res.loc[ipair, 7] = res.iloc[ipair, 6] * nrows * ncols
            if (res.iloc[ipair, 7] > 1):
                res.iloc[ipair, 7] = 1
            ipair = ipair + 1
            
    
    #column names
    if (residual == "standardized"):
        resUsed = "st. residual"
    elif (residual == "adjusted"):
        resUsed = "adj. st. residual"        
    colNames = ["field1", "field2", "observed", "expected", "residual", resUsed, "p-value", "adj. p-value"]
    res.columns = colNames
    return (res)