Module stikpetP.other.poho_column_proportion

Expand source code
import pandas as pd
from statistics import NormalDist
from ..other.table_cross import tab_cross

def ph_column_proportion(field1, field2, categories1=None, categories2=None, seMethod = "spss"):
    '''
    Post-Hoc Column Proportion Test
    -------------------------------
    
    A test to compare for each row the percentages based on the column totals.
    For every row of the cross table, each pair of columns is compared with a
    two-sided z-test on the two column proportions.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the categories to use from field1. Passed unchanged to `tab_cross`.
    categories2 : list or dictionary, optional
        the categories to use from field2. Passed unchanged to `tab_cross`.
    seMethod : {"spss", "marascuilo"}, optional
        which method to use for the standard error. Any value other than
        "spss" selects the marascuilo (unpooled) standard error.
        
    Returns
    -------
    A dataframe with one line per row category and pair of column categories:
    * *field1* the row of which the column percentages are compared
    * *field2-1* one of two categories being compared to *field2-2*
    * *field2-2* one of two categories being compared to *field2-1*
    * *col. prop. 1*, the column proportion of *field2-1*
    * *col. prop. 2*, the column proportion of *field2-2*
    * *difference*, the difference between the two column proportions
    * *z-value*, the test statistic   
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value    
    
    If the cross table has no rows or fewer than two columns, an empty
    dataframe with these columns is returned.
    
    Notes
    -----    
    The formula used for the marascuilo (Marascuilo, 1971, p. 381):
    $$z_{i,j,k} = \\frac{\\tilde{p}_{i,j} - \\tilde{p}_{i,k}}{\\sqrt{\\frac{\\tilde{p}_{i,j}\\times\\left(1 - \\tilde{p}_{i,j}\\right)}{C_j} + \\frac{\\tilde{p}_{i,k}\\times\\left(1 - \\tilde{p}_{i,k}\\right)}{C_k}}}$$
    
    The formula used for SPSS (IBM, 2011, pp. 169-170):
    $$z_{i,j,k} = \\frac{\\tilde{p}_{i,j} - \\tilde{p}_{i,k}}{\\sqrt{\\hat{p}_{i,j,k}\\times\\left(1-\\hat{p}_{i,j,k}\\right)\\times\\left(\\frac{1}{C_j} + \\frac{1}{C_k}\\right)}}$$
    
    With:
    $$\\tilde{p}_{i,x} = \\frac{F_{i,x}}{C_x}$$
    $$\\hat{p}_{i,j,k} = \\frac{C_j \\times \\tilde{p}_{i,j} + C_k \\times \\tilde{p}_{i,k}}{C_j + C_k}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_{i,j,k}\\right|\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons,
    using the number of column pairs compared within a row. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times \\frac{c \\times \\left(c - 1\\right)}{2}, 1\\right)$$
    
    If the standard error is zero the z-value is reported as NaN when the
    difference is also zero, and as plus or minus infinity otherwise.
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(\\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    IBM. (2011). IBM SPSS Statistics 20 Algorithms. IBM.
    
    Marascuilo, L. A. (1971). *Statistical methods for behavioral science research*. McGraw-Hill.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    '''
    #create the cross table, the last row and last column hold the totals
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
    
    #basic counts (excluding the totals row and totals column)
    nrows = ct.shape[0] - 1
    ncols = ct.shape[1] - 1
    
    #loop invariants
    rowLabels = list(ct.index)
    colLabels = list(ct.columns)
    colTotals = [ct.iloc[nrows, j] for j in range(ncols)]
    nPairs = ncols * (ncols - 1) / 2
    stdNormal = NormalDist()
    
    colNames = ["field1", "field2-1", "field2-2", "col. prop. 1", "col. prop. 2", "difference", "z-value", "p-value", "adj. p-value"]
    
    #collect the results per comparison, the dataframe is built once at the end
    results = []
    for i in range(nrows):
        for j in range(ncols - 1):
            for k in range(j + 1, ncols):
                Cj = colTotals[j]
                Ck = colTotals[k]
                
                #column proportions
                p1 = ct.iloc[i, j] / Cj
                p2 = ct.iloc[i, k] / Ck
                
                #difference
                diff = p1 - p2
                
                #standard error
                if (seMethod == "spss"):
                    #pooled proportion of the two columns
                    phat = (Cj * p1 + Ck * p2) / (Cj + Ck)
                    se = (phat * (1 - phat) * (1 / Cj + 1 / Ck))**0.5
                else:
                    #unpooled: each proportion with its own complement
                    se = (p1 * (1 - p1) / Cj + p2 * (1 - p2) / Ck)**0.5
                
                #statistic, avoiding an exception on a zero standard error
                if (se == 0):
                    if (diff == 0):
                        z = float("nan")
                    else:
                        z = diff * float("inf")
                else:
                    z = diff / se
                
                #p-value (two-sided)
                pValue = 2 * (1 - stdNormal.cdf(abs(z)))
                
                #Bonferroni adjustment, capped at 1
                pAdj = pValue * nPairs
                if (pAdj > 1):
                    pAdj = 1
                
                results.append([rowLabels[i], colLabels[j], colLabels[k], float(p1), float(p2), float(diff), float(z), float(pValue), float(pAdj)])
    
    res = pd.DataFrame(results, columns=colNames)

    return (res)

Functions

def ph_column_proportion(field1, field2, categories1=None, categories2=None, seMethod='spss')

Post-Hoc Column Proportion Test

A test to compare for each row the percentages based on the column totals.

Parameters

field1 : pandas series
data with categories for the rows
field2 : pandas series
data with categories for the columns
categories1 : list or dictionary, optional
the two categories to use from field1. If not set the first two found will be used
categories2 : list or dictionary, optional
the two categories to use from field2. If not set the first two found will be used
seMethod : {"spss", "marascuilo"}, optional
which method to use for the standard error.

Returns

A dataframe with:
 
  • field1 the row of which the column percentages are compared
  • field2-1 one of two categories being compared to field2-2
  • field2-2 one of two categories being compared to field2-1
  • col. prop. 1, the column proportion of field2-1
  • col. prop. 2, the column proportion of field2-2
  • difference, the difference between the two column proportions
  • z-value, the test statistic
  • p-value, the significance (p-value)
  • adj. p-value, the Bonferroni adjusted p-value

Notes

The formula used for the marascuilo (Marascuilo, 1971, p. 381): z_{i,j,k} = \frac{\tilde{p}_{i,j} - \tilde{p}_{i,k}}{\sqrt{\frac{\tilde{p}_{i,j}\times\left(1 - \tilde{p}_{i,j}\right)}{C_j} + \frac{\tilde{p}_{i,k}\times\left(1 - \tilde{p}_{i,k}\right)}{C_k}}}

The formula used for SPSS (IBM, 2011, pp. 169-170): z_{i,j,k} = \frac{\tilde{p}_{i,j} - \tilde{p}_{i,k}}{\sqrt{\hat{p}_{i,j,k}\times\left(1-\hat{p}_{i,j,k}\right)\times\left(\frac{1}{C_j} + \frac{1}{C_k}\right)}}

With:

\tilde{p}_{i,x} = \frac{F_{i,x}}{C_x}

\hat{p}_{i,j,k} = \frac{C_j \times \tilde{p}_{i,j} + C_k \times \tilde{p}_{i,k}}{C_j + C_k}

C_j = \sum_{i=1}^r F_{i,j}

The p-value (sig.) is then determined using: sig. = 2\times\left(1 - \Phi\left(\left|z_{i,j,k}\right|\right)\right)

A simple Bonferroni correction is applied for the multiple comparisons, using the number of column pairs compared within a row. This is simply: sig._{adj} = \min \left(sig. \times \frac{c \times \left(c - 1\right)}{2}, 1\right)

Symbols used:

  • F_{i,j} the observed count of cell in row i, column j
  • R_i the row total of row i
  • C_j the column total of column j
  • r the number of rows
  • c the number of columns
  • n the grand total
  • \Phi\left(...\right) the cumulative distribution function of the standard normal distribution.

References

IBM. (2011). IBM SPSS Statistics 20 Algorithms. IBM.

Marascuilo, L. A. (1971). Statistical methods for behavioral science research. McGraw-Hill.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code
def ph_column_proportion(field1, field2, categories1=None, categories2=None, seMethod = "spss"):
    '''
    Post-Hoc Column Proportion Test
    -------------------------------
    
    A test to compare for each row the percentages based on the column totals.
    For every row of the cross table, each pair of columns is compared with a
    two-sided z-test on the two column proportions.
    
    Parameters
    ----------
    field1 : pandas series
        data with categories for the rows
    field2 : pandas series
        data with categories for the columns
    categories1 : list or dictionary, optional
        the categories to use from field1. Passed unchanged to `tab_cross`.
    categories2 : list or dictionary, optional
        the categories to use from field2. Passed unchanged to `tab_cross`.
    seMethod : {"spss", "marascuilo"}, optional
        which method to use for the standard error. Any value other than
        "spss" selects the marascuilo (unpooled) standard error.
        
    Returns
    -------
    A dataframe with one line per row category and pair of column categories:
    * *field1* the row of which the column percentages are compared
    * *field2-1* one of two categories being compared to *field2-2*
    * *field2-2* one of two categories being compared to *field2-1*
    * *col. prop. 1*, the column proportion of *field2-1*
    * *col. prop. 2*, the column proportion of *field2-2*
    * *difference*, the difference between the two column proportions
    * *z-value*, the test statistic   
    * *p-value*, the significance (p-value)
    * *adj. p-value*, the Bonferroni adjusted p-value    
    
    If the cross table has no rows or fewer than two columns, an empty
    dataframe with these columns is returned.
    
    Notes
    -----    
    The formula used for the marascuilo (Marascuilo, 1971, p. 381):
    $$z_{i,j,k} = \\frac{\\tilde{p}_{i,j} - \\tilde{p}_{i,k}}{\\sqrt{\\frac{\\tilde{p}_{i,j}\\times\\left(1 - \\tilde{p}_{i,j}\\right)}{C_j} + \\frac{\\tilde{p}_{i,k}\\times\\left(1 - \\tilde{p}_{i,k}\\right)}{C_k}}}$$
    
    The formula used for SPSS (IBM, 2011, pp. 169-170):
    $$z_{i,j,k} = \\frac{\\tilde{p}_{i,j} - \\tilde{p}_{i,k}}{\\sqrt{\\hat{p}_{i,j,k}\\times\\left(1-\\hat{p}_{i,j,k}\\right)\\times\\left(\\frac{1}{C_j} + \\frac{1}{C_k}\\right)}}$$
    
    With:
    $$\\tilde{p}_{i,x} = \\frac{F_{i,x}}{C_x}$$
    $$\\hat{p}_{i,j,k} = \\frac{C_j \\times \\tilde{p}_{i,j} + C_k \\times \\tilde{p}_{i,k}}{C_j + C_k}$$
    $$C_j = \\sum_{i=1}^r F_{i,j}$$
    
    The p-value (sig.) is then determined using:
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_{i,j,k}\\right|\\right)\\right)$$
    
    A simple Bonferroni correction is applied for the multiple comparisons,
    using the number of column pairs compared within a row. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times \\frac{c \\times \\left(c - 1\\right)}{2}, 1\\right)$$
    
    If the standard error is zero the z-value is reported as NaN when the
    difference is also zero, and as plus or minus infinity otherwise.
    
    *Symbols used:*
    
    * \\(F_{i,j}\\) the observed count of cell in row i, column j
    * \\(C_j\\) the column total of column j
    * \\(r\\) the number of rows
    * \\(c\\) the number of columns
    * \\(\\Phi\\left(...\\right)\\) the cumulative distribution function of the standard normal distribution.
    
    References
    ----------
    IBM. (2011). IBM SPSS Statistics 20 Algorithms. IBM.
    
    Marascuilo, L. A. (1971). *Statistical methods for behavioral science research*. McGraw-Hill.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    '''
    #create the cross table, the last row and last column hold the totals
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
    
    #basic counts (excluding the totals row and totals column)
    nrows = ct.shape[0] - 1
    ncols = ct.shape[1] - 1
    
    #loop invariants
    rowLabels = list(ct.index)
    colLabels = list(ct.columns)
    colTotals = [ct.iloc[nrows, j] for j in range(ncols)]
    nPairs = ncols * (ncols - 1) / 2
    stdNormal = NormalDist()
    
    colNames = ["field1", "field2-1", "field2-2", "col. prop. 1", "col. prop. 2", "difference", "z-value", "p-value", "adj. p-value"]
    
    #collect the results per comparison, the dataframe is built once at the end
    results = []
    for i in range(nrows):
        for j in range(ncols - 1):
            for k in range(j + 1, ncols):
                Cj = colTotals[j]
                Ck = colTotals[k]
                
                #column proportions
                p1 = ct.iloc[i, j] / Cj
                p2 = ct.iloc[i, k] / Ck
                
                #difference
                diff = p1 - p2
                
                #standard error
                if (seMethod == "spss"):
                    #pooled proportion of the two columns
                    phat = (Cj * p1 + Ck * p2) / (Cj + Ck)
                    se = (phat * (1 - phat) * (1 / Cj + 1 / Ck))**0.5
                else:
                    #unpooled: each proportion with its own complement
                    se = (p1 * (1 - p1) / Cj + p2 * (1 - p2) / Ck)**0.5
                
                #statistic, avoiding an exception on a zero standard error
                if (se == 0):
                    if (diff == 0):
                        z = float("nan")
                    else:
                        z = diff * float("inf")
                else:
                    z = diff / se
                
                #p-value (two-sided)
                pValue = 2 * (1 - stdNormal.cdf(abs(z)))
                
                #Bonferroni adjustment, capped at 1
                pAdj = pValue * nPairs
                if (pAdj > 1):
                    pAdj = 1
                
                results.append([rowLabels[i], colLabels[j], colLabels[k], float(p1), float(p2), float(diff), float(z), float(pValue), float(pAdj)])
    
    res = pd.DataFrame(results, columns=colNames)

    return (res)