Module `stikpetP.other.table_cross`

Expand source code

import pandas as pd

def tab_cross(field1, field2, order1=None, order2=None, percent=None, totals="exclude", dropEmpty=True):
    '''
    Cross Table / Contingency Table
    -------------------------------

    A contingency table can be defined as “tables arising when observations on a number of categorical variables are cross-classified” (Everitt, 2004, p.89).

    There are quite a few variations on the name for this type of table. Perhaps the oldest name is actually contingency table, which was the name Pearson (1904, p. 34) gave to them. Another popular name is cross tabulation (Upton & Cook, 2002, p. 79), but also cross classification table (Zedeck, 2014, p. 71) and bivariate frequency table (Porkess, 1988, p. 48) are used. The one I use is cross table, which can for example be found in Newbold et al. (2013, p. 9) or Sá (2007, p. 52).

    This function is shown in this [YouTube video](https://youtu.be/3tN6zk3hdco) and the table is also discussed at [PeterStatistics.com](https://peterstatistics.com/Terms/Tables/CrossTable.html)

    Parameters
    ----------
    field1 : list, tuple or pandas series
        data with categories for the rows
    field2 : list, tuple or pandas series
        data with categories for the columns
    order1 : list or dictionary, optional
        order for categories of field1. A dictionary is read as {label: value in the data}; the data values are replaced by the labels and the order of the keys is used as the category order.
    order2 : list or dictionary, optional
        order for categories of field2. Same conventions as order1.
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts). Any other value also leaves the counts as they are.
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    dropEmpty : boolean, optional
        drop rows and/or columns with only zeros (passed on as `dropna` to pandas crosstab). Default is True

    Returns
    -------
    dataframe : the cross table

    Counts are returned as integers (also when the totals are included), percentages as floats.

    References
    ----------
    Everitt, B. (2004). *The Cambridge dictionary of statistics* (2nd ed.). Cambridge University Press.

    Newbold, P., Carlson, W. L., & Thorne, B. (2013). *Statistics for business and economics* (8th ed). Pearson.

    Pearson, K. (1904). *Contributions to the Mathematical Theory of Evolution*. XIII. On the theory of contingency and its relation to association and normal correlation. Dulau and Co.

    Porkess, R. (1988). *Dictionary of statistics*. Collins.

    Sá, J. P. M. de. (2007). *Applied statistics: Using SPSS, Statistica, MATLAB, and R* (2nd ed.). Springer.

    Upton, G., & Cook, I. (2002). *Oxford: Dictionary of statistics*. Oxford University Press.

    Zedeck, S. (Ed.). (2014). *APA dictionary of statistics and research methods*. American Psychological Association.

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    '''
    def _as_factor(field, order):
        # Turn one input field into something pd.crosstab reads as a single factor.
        # A bare list or tuple would be interpreted by pd.crosstab as a list of
        # several factors, so those are wrapped in a Series first.
        if isinstance(field, (list, tuple)):
            field = pd.Series(field)

        if order is None:
            return field

        categories = order
        if isinstance(order, dict):
            # order is {label: value}; flip it to recode the stored values to labels
            value_to_label = {value: label for label, value in order.items()}
            # pd.Series(...) so that index-free inputs (e.g. arrays) can be recoded too
            field = pd.Series(field).replace(value_to_label)
            categories = list(order)

        return pd.Categorical(field, categories=categories, ordered=True)

    field1 = _as_factor(field1, order1)
    field2 = _as_factor(field2, order2)

    tab = pd.crosstab(field1, field2, margins=False, dropna=dropEmpty)

    # pandas crosstab does not add margins properly when NaN are involved, so add them on my own
    rowSums = tab.sum(axis=1)
    colSums = tab.sum(axis=0)
    n = rowSums.sum()

    if totals == "include":
        tab['Total'] = rowSums
        tab.loc['Total', :] = colSums
        tab.at['Total', 'Total'] = n
        # the margins take part in the percentage calculations below
        colSums.at['Total'] = n
        rowSums.at['Total'] = n
        # adding the margin row passes through NaN, which turns the counts
        # into floats; every cell is filled again by now, so restore integers
        tab = tab.astype('int64')

    # the divisors are applied by position (plain arrays), one per column / row
    if percent == "all":
        tab = tab.astype(float) / n * 100
    elif percent == "column":
        tab = tab.astype(float).div(colSums.to_numpy(), axis=1) * 100
    elif percent == "row":
        tab = tab.astype(float).div(rowSums.to_numpy(), axis=0) * 100

    return tab

Functions

def tab_cross(field1, field2, order1=None, order2=None, percent=None, totals='exclude', dropEmpty=True)

Cross Table / Contingency Table

A contingency table can be defined as “tables arising when observations on a number of categorical variables are cross-classified” (Everitt, 2004, p.89).

There are quite a few variations on the name for this type of table. Perhaps the oldest name is actually contingency table, which was the name Pearson (1904, p. 34) gave to them. Another popular name is cross tabulation (Upton & Cook, 2002, p. 79), but also cross classification table (Zedeck, 2014, p. 71) and bivariate frequency table (Porkess, 1988, p. 48) are used. The one I use is cross table, which can for example be found in Newbold et al. (2013, p. 9) or Sá (2007, p. 52).

This function is shown in this YouTube video and the table is also discussed at PeterStatistics.com

Parameters

field1 : pandas series: data with categories for the rows
field2 : pandas series: data with categories for the columns
order1 : list or dictionary, optional: order for categories of field1
order2 : list or dictionary, optional: order for categories of field2
percent : {None, "all", "row", "column"}, optional: which percentages to show. Default is None (will show counts)
totals : {"exclude", "include"}, optional: add margin totals. Default is "exclude"
dropEmpty : boolean, optional: drop rows and/or columns with only zeros. Default is True

Returns

dataframe : the cross table

References

Everitt, B. (2004). The Cambridge dictionary of statistics (2nd ed.). Cambridge University Press.

Newbold, P., Carlson, W. L., & Thorne, B. (2013). Statistics for business and economics (8th ed). Pearson.

Pearson, K. (1904). Contributions to the Mathematical Theory of Evolution. XIII. On the theory of contingency and its relation to association and normal correlation. Dulau and Co.

Porkess, R. (1988). Dictionary of statistics. Collins.

Sá, J. P. M. de. (2007). Applied statistics: Using SPSS, Statistica, MATLAB, and R (2nd ed.). Springer.

Upton, G., & Cook, I. (2002). Oxford: Dictionary of statistics. Oxford University Press.

Zedeck, S. (Ed.). (2014). APA dictionary of statistics and research methods. American Psychological Association.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def tab_cross(field1, field2, order1=None, order2=None, percent=None, totals="exclude", dropEmpty=True):
    '''
    Cross Table / Contingency Table
    -------------------------------

    A contingency table can be defined as “tables arising when observations on a number of categorical variables are cross-classified” (Everitt, 2004, p.89).

    There are quite a few variations on the name for this type of table. Perhaps the oldest name is actually contingency table, which was the name Pearson (1904, p. 34) gave to them. Another popular name is cross tabulation (Upton & Cook, 2002, p. 79), but also cross classification table (Zedeck, 2014, p. 71) and bivariate frequency table (Porkess, 1988, p. 48) are used. The one I use is cross table, which can for example be found in Newbold et al. (2013, p. 9) or Sá (2007, p. 52).

    This function is shown in this [YouTube video](https://youtu.be/3tN6zk3hdco) and the table is also discussed at [PeterStatistics.com](https://peterstatistics.com/Terms/Tables/CrossTable.html)

    Parameters
    ----------
    field1 : list, tuple or pandas series
        data with categories for the rows
    field2 : list, tuple or pandas series
        data with categories for the columns
    order1 : list or dictionary, optional
        order for categories of field1. A dictionary is read as {label: value in the data}; the data values are replaced by the labels and the order of the keys is used as the category order.
    order2 : list or dictionary, optional
        order for categories of field2. Same conventions as order1.
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts). Any other value also leaves the counts as they are.
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    dropEmpty : boolean, optional
        drop rows and/or columns with only zeros (passed on as `dropna` to pandas crosstab). Default is True

    Returns
    -------
    dataframe : the cross table

    Counts are returned as integers (also when the totals are included), percentages as floats.

    References
    ----------
    Everitt, B. (2004). *The Cambridge dictionary of statistics* (2nd ed.). Cambridge University Press.

    Newbold, P., Carlson, W. L., & Thorne, B. (2013). *Statistics for business and economics* (8th ed). Pearson.

    Pearson, K. (1904). *Contributions to the Mathematical Theory of Evolution*. XIII. On the theory of contingency and its relation to association and normal correlation. Dulau and Co.

    Porkess, R. (1988). *Dictionary of statistics*. Collins.

    Sá, J. P. M. de. (2007). *Applied statistics: Using SPSS, Statistica, MATLAB, and R* (2nd ed.). Springer.

    Upton, G., & Cook, I. (2002). *Oxford: Dictionary of statistics*. Oxford University Press.

    Zedeck, S. (Ed.). (2014). *APA dictionary of statistics and research methods*. American Psychological Association.

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    '''
    def _as_factor(field, order):
        # Turn one input field into something pd.crosstab reads as a single factor.
        # A bare list or tuple would be interpreted by pd.crosstab as a list of
        # several factors, so those are wrapped in a Series first.
        if isinstance(field, (list, tuple)):
            field = pd.Series(field)

        if order is None:
            return field

        categories = order
        if isinstance(order, dict):
            # order is {label: value}; flip it to recode the stored values to labels
            value_to_label = {value: label for label, value in order.items()}
            # pd.Series(...) so that index-free inputs (e.g. arrays) can be recoded too
            field = pd.Series(field).replace(value_to_label)
            categories = list(order)

        return pd.Categorical(field, categories=categories, ordered=True)

    field1 = _as_factor(field1, order1)
    field2 = _as_factor(field2, order2)

    tab = pd.crosstab(field1, field2, margins=False, dropna=dropEmpty)

    # pandas crosstab does not add margins properly when NaN are involved, so add them on my own
    rowSums = tab.sum(axis=1)
    colSums = tab.sum(axis=0)
    n = rowSums.sum()

    if totals == "include":
        tab['Total'] = rowSums
        tab.loc['Total', :] = colSums
        tab.at['Total', 'Total'] = n
        # the margins take part in the percentage calculations below
        colSums.at['Total'] = n
        rowSums.at['Total'] = n
        # adding the margin row passes through NaN, which turns the counts
        # into floats; every cell is filled again by now, so restore integers
        tab = tab.astype('int64')

    # the divisors are applied by position (plain arrays), one per column / row
    if percent == "all":
        tab = tab.astype(float) / n * 100
    elif percent == "column":
        tab = tab.astype(float).div(colSums.to_numpy(), axis=1) * 100
    elif percent == "row":
        tab = tab.astype(float).div(rowSums.to_numpy(), axis=0) * 100

    return tab