Module `stikpetP.other.poho_residual`

Expand source code

import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross

def ph_residual(field1, field2, categories1=None, categories2=None, residual="adjusted"):
    '''
    Post-Hoc Residual Test
    ----------------------
    
    Performs a z-test based on the residuals for each cell in a cross table.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the two categories to use from field1. If not set the first two found will be used
    categories2 : list or dictionary, optional
        the two categories to use from field2. If not set the first two found will be used
    residual : {"adjusted", "standardized"}, optional
        which residual to use, either adjusted standardized, or standardized.

    Returns
    -------
    A dataframe with:
    * *field1* and *field2*, the two categories compared
    * *observed*, the observed count in the cell
    * *expected*, the expected count from the cell
    * *residual*, the difference between observed and expected
    * *resUsed*, the residual used    
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value
    
    Notes
    -----    
    The formula used for the standardized residuals is (Haberman, 1973, p. 205):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}}}$$
    
    This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).
    
    The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}\\times\\left(1 - \\frac{R_i}{n}\\right)\\times\\left(1 - \\frac{C_j}{n}\\right)}}$$
    
    With:
    $$E_{i,j} = \\frac{R_{i}\\times C_{j}}{n}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    $$R_i = \\sum_{j=1}^c F_{i,j}$$
    $$n = \\sum_{i=1}^r \\sum_{j=1}^c F_{i,j} = \\sum_{j=1}^c C_{j} = \\sum_{i=1}^r R_i$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(z_{i,j}\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times r \\times c, 1\\right)$$
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(E_{i,j}\\) the expected count of cell in row i, column j
    * \\(R_i\\) the row total of row i
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(n\\) the grand total
    * \\(\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. *Biometrics, 29*(1), 205–220. doi:10.2307/2529686
    
    Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, *Research & Evaluation, 20*(8), 1–10.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
  
    #basic counts
    nrows = ct.shape[0] - 1
    ncols =  ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    
    res = pd.DataFrame()
    
    ipair=0
    for i in range(0, nrows):
        for j in range(0, ncols):
            res.loc[ipair, 0] = list(ct.index)[i]
            res.loc[ipair, 1] = list(ct.columns)[j]
            res.loc[ipair, 2] = ct.iloc[i, j]
            
            #expected count
            res.loc[ipair, 3] = ct.iloc[nrows, j] * ct.iloc[i, ncols] / n
            
            #residual
            res.loc[ipair, 4] = res.iloc[ipair, 2] - res.iloc[ipair, 3]
            
            #adjusted
            if (residual == "standardized"):
                se = res.iloc[ipair, 3]**0.5
            elif (residual == "adjusted"):
                se = (res.iloc[ipair, 3] * (1 - ct.iloc[i, ncols] / n) * (1 - ct.iloc[nrows, j] / n))**0.5
            
            res.loc[ipair, 5] = res.iloc[ipair, 4] / se
            
            #p-value
            res.loc[ipair, 6] = 2 * (1 - NormalDist().cdf(abs(res.iloc[ipair, 5])))
            
            #adj
            res.loc[ipair, 7] = res.iloc[ipair, 6] * nrows * ncols
            if (res.iloc[ipair, 7] > 1):
                res.iloc[ipair, 7] = 1
            ipair = ipair + 1
            
    
    #column names
    if (residual == "standardized"):
        resUsed = "st. residual"
    elif (residual == "adjusted"):
        resUsed = "adj. st. residual"        
    colNames = ["field1", "field2", "observed", "expected", "residual", resUsed, "p-value", "adj. p-value"]
    res.columns = colNames
    return (res)

Functions

def ph_residual(field1, field2, categories1=None, categories2=None, residual='adjusted')

Post-Hoc Residual Test

Performs a z-test based on the residuals for each cell in a cross table.

Parameters

field1 : pandas series: data with categories for the rows
field2 : pandas series: data with categories for the columns
categories1 : list or dictionary, optional: the two categories to use from field1. If not set the first two found will be used
categories2 : list or dictionary, optional: the two categories to use from field2. If not set the first two found will be used
residual : {"adjusted", "standardized"}, optional: which residual to use, either adjusted standardized, or standardized.

Returns

A dataframe with:

field1 and field2, the two categories compared
observed, the observed count in the cell
expected, the expected count from the cell
residual, the difference between observed and expected
resUsed, the residual used
p-value, the significance (p-value)
adj. p-value, the Bonferroni adjusted p-value

Notes

The formula used for the standardized residuals is (Haberman, 1973, p. 205): $z_{i,j} = \frac{F_{i,j} - E_{i,j}}{\sqrt{E_{i,j}}}$

This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).

The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207): $z_{i,j} = \frac{F_{i,j} - E_{i,j}}{\sqrt{E_{i,j}\times\left(1 - \frac{R_i}{n}\right)\times\left(1 - \frac{C_j}{n}\right)}}$

With: $E_{i,j} = \frac{R_{i}\times C_{j}}{n}$ $C_j = \sum_{i=1}^r F_{i,j}$ $R_i = \sum_{j=1}^c F_{i,j}$ $n = \sum_{i=1}^r \sum_{j=1}^c F_{i,j} = \sum_{j=1}^c C_{j} = \sum_{i=1}^r R_i$

The p-value (sig.) is then determined using: $sig. = 2\times\left(1 - \Phi\left(z_{i,j}\right)\right)$

A simple Bonferroni correction is applied for the multiple comparisons. This is simply: $sig._{adj} = \min \left(sig. \times r \times c, 1\right)$

Symbols used:

$F_{i,j}$ the observed count of cell in row i, column j
$E_{i,j}$ the expected count of cell in row i, column j
$R_i$ the row total of row i
$C_j$ the column total of column j
$r$ the number of rows
$c$ the number of columns
$n$ the grand total
$\Phi\left(...\right)$ the cumulative distribution function of the standard normal distribution.

References

Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. Biometrics, 29(1), 205–220. doi:10.2307/2529686

Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, Research & Evaluation, 20(8), 1–10.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def ph_residual(field1, field2, categories1=None, categories2=None, residual="adjusted"):
    '''
    Post-Hoc Residual Test
    ----------------------
    
    Performs a z-test based on the residuals for each cell in a cross table.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the two categories to use from field1. If not set the first two found will be used
    categories2 : list or dictionary, optional
        the two categories to use from field2. If not set the first two found will be used
    residual : {"adjusted", "standardized"}, optional
        which residual to use, either adjusted standardized, or standardized.

    Returns
    -------
    A dataframe with:
    * *field1* and *field2*, the two categories compared
    * *observed*, the observed count in the cell
    * *expected*, the expected count from the cell
    * *residual*, the difference between observed and expected
    * *resUsed*, the residual used    
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value
    
    Notes
    -----    
    The formula used for the standardized residuals is (Haberman, 1973, p. 205):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}}}$$
    
    This is also referred to a Pearson residuals (Sharpe, 2015, p. 3).
    
    The formula used for the adjusted standardized residuals is (Haberman, 1973, p. 207):
    $$z_{i,j} = \\frac{F_{i,j} - E_{i,j}}{\\sqrt{E_{i,j}\\times\\left(1 - \\frac{R_i}{n}\\right)\\times\\left(1 - \\frac{C_j}{n}\\right)}}$$
    
    With:
    $$E_{i,j} = \\frac{R_{i}\\times C_{j}}{n}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    $$R_i = \\sum_{j=1}^c F_{i,j}$$
    $$n = \\sum_{i=1}^r \\sum_{j=1}^c F_{i,j} = \\sum_{j=1}^c C_{j} = \\sum_{i=1}^r R_i$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(z_{i,j}\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times r \\times c, 1\\right)$$
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(E_{i,j}\\) the expected count of cell in row i, column j
    * \\(R_i\\) the row total of row i
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(n\\) the grand total
    * \\(\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    Haberman, S. J. (1973). The analysis of residuals in cross-classified tables. *Biometrics, 29*(1), 205–220. doi:10.2307/2529686
    
    Sharpe, D. (2015). Your chi-square test is statistically significant: Now what? Practical Assessment, *Research & Evaluation, 20*(8), 1–10.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    #create the cross table
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
  
    #basic counts
    nrows = ct.shape[0] - 1
    ncols =  ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    
    res = pd.DataFrame()
    
    ipair=0
    for i in range(0, nrows):
        for j in range(0, ncols):
            res.loc[ipair, 0] = list(ct.index)[i]
            res.loc[ipair, 1] = list(ct.columns)[j]
            res.loc[ipair, 2] = ct.iloc[i, j]
            
            #expected count
            res.loc[ipair, 3] = ct.iloc[nrows, j] * ct.iloc[i, ncols] / n
            
            #residual
            res.loc[ipair, 4] = res.iloc[ipair, 2] - res.iloc[ipair, 3]
            
            #adjusted
            if (residual == "standardized"):
                se = res.iloc[ipair, 3]**0.5
            elif (residual == "adjusted"):
                se = (res.iloc[ipair, 3] * (1 - ct.iloc[i, ncols] / n) * (1 - ct.iloc[nrows, j] / n))**0.5
            
            res.loc[ipair, 5] = res.iloc[ipair, 4] / se
            
            #p-value
            res.loc[ipair, 6] = 2 * (1 - NormalDist().cdf(abs(res.iloc[ipair, 5])))
            
            #adj
            res.loc[ipair, 7] = res.iloc[ipair, 6] * nrows * ncols
            if (res.iloc[ipair, 7] > 1):
                res.iloc[ipair, 7] = 1
            ipair = ipair + 1
            
    
    #column names
    if (residual == "standardized"):
        resUsed = "st. residual"
    elif (residual == "adjusted"):
        resUsed = "adj. st. residual"        
    colNames = ["field1", "field2", "observed", "expected", "residual", resUsed, "p-value", "adj. p-value"]
    res.columns = colNames
    return (res)