Module stikpetP.other.table_cross_bin

Expand source code
import pandas as pd
from ..other.table_frequency_bins import tab_frequency_bins

def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs):
    '''
    Cross Table with Bins
    ---------------------

    This function creates a cross-table of a categorical field, and bins from a scale field.

    Parameters
    ----------
    catField : pandas series or list
        data with categories for the columns
    scaleField : pandas series or list
        data with scores that will be binned
    orderCat : list, optional
        order for categories
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts)
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    fd : bool, optional
        indication to use frequency density, rater than counts. default is False
    kwargs : optional
        other arguments passed to tab_frequency_bins() to set the bins

    Returns
    -------
    dataframe : cross table

    Notes
    -----
    The percentage can be set to either 
    
    * None, then counts or frequency density will be shown
    * "all", then the grand total will be used to determine the percentage
    * "row", the row totals will be used to calculate the percentages
    * "column", the column totals will be used to calculate the percentages

    When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    
    '''
    #convert to pandas series if needed
    if type(catField) is list:
        catField = pd.Series(catField)
    
    if type(scaleField) is list:
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe
    dfr_sub = pd.concat([catField, scaleField], axis=1)
    dfr_sub = dfr_sub.dropna()
    dfr_sub.columns = ['category', 'score']

    # get the bins
    total = tab_frequency_bins(dfr_sub['score'], **kwargs)
    bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))]
    res = total[['lower bound', 'upper bound']]

    # remove bins from kwargs
    kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'}

    # determine the counts for each category
    if orderCat is None:
        cat_lbls = list(dfr_sub['category'].unique())
    else:
        cat_lbls = orderCat
        
    for i in cat_lbls:
        cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i]
        if fd:
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density']
            f_cat = f_cat.to_frame(name=f'fd_{i}')
        else:            
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency']
            f_cat = f_cat.to_frame(name=f'freq_{i}')
        res = pd.concat([res, f_cat], axis=1)
        
    res = res.reset_index(drop=True)

    if fd:
        col_total_fr = total['frequency density']
    else:
        col_total_fr = total['frequency']

    col_totals = res.iloc[:,2:len(cat_lbls)+2].sum()
    row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1)
    total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())

    # the row percentages
    if percent=='row':
        # add the row total
        col_total_fr = col_total_fr.to_frame(name='TOTAL')
        res = pd.concat([res, col_total_fr], axis=1)
    
        # the row percentages
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names + ['TOTAL']

    # the column percentages
    if percent=='column':
        col_totals = res.sum()
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names  

    # overall percent
    if percent=='both':
        total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
        res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total

    # show totals or not
    if totals=='exclude':
        if percent=='row':
            # remove the created rows total column
            res = res.iloc[:, 0:len(cat_lbls)+2]
    elif totals=='include':
        if percent=='row':
            # only need to add the column totals row
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        elif percent=='row' or percent=='column':
            res['TOTAL'] = row_totals/total*100
            if percent=='column':
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100]
            else:
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        else:
            res['TOTAL'] = row_totals
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total]
            
    return res

Functions

def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False, **kwargs)

Cross Table With Bins

This function creates a cross-table of a categorical field, and bins from a scale field.

Parameters

catField : pandas series or list
data with categories for the columns
scaleField : pandas series or list
data with scores that will be binned
orderCat : list, optional
order for categories
percent : {None, "all", "row", "column"}, optional
which percentages to show. Default is None (will show counts)
totals : {"exclude", "include"}, optional
add margin totals. Default is "exclude"
fd : bool, optional
indication to use frequency density, rater than counts. default is False
kwargs : optional
other arguments passed to tab_frequency_bins() to set the bins

Returns

dataframe : cross table
 

Notes

The percentage can be set to either

  • None, then counts or frequency density will be shown
  • "all", then the grand total will be used to determine the percentage
  • "row", the row totals will be used to calculate the percentages
  • "column", the column totals will be used to calculate the percentages

When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code
def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs):
    '''
    Cross Table with Bins
    ---------------------

    This function creates a cross-table of a categorical field, and bins from a scale field.

    Parameters
    ----------
    catField : pandas series or list
        data with categories for the columns
    scaleField : pandas series or list
        data with scores that will be binned
    orderCat : list, optional
        order for categories
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts)
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    fd : bool, optional
        indication to use frequency density, rater than counts. default is False
    kwargs : optional
        other arguments passed to tab_frequency_bins() to set the bins

    Returns
    -------
    dataframe : cross table

    Notes
    -----
    The percentage can be set to either 
    
    * None, then counts or frequency density will be shown
    * "all", then the grand total will be used to determine the percentage
    * "row", the row totals will be used to calculate the percentages
    * "column", the column totals will be used to calculate the percentages

    When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    
    '''
    #convert to pandas series if needed
    if type(catField) is list:
        catField = pd.Series(catField)
    
    if type(scaleField) is list:
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe
    dfr_sub = pd.concat([catField, scaleField], axis=1)
    dfr_sub = dfr_sub.dropna()
    dfr_sub.columns = ['category', 'score']

    # get the bins
    total = tab_frequency_bins(dfr_sub['score'], **kwargs)
    bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))]
    res = total[['lower bound', 'upper bound']]

    # remove bins from kwargs
    kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'}

    # determine the counts for each category
    if orderCat is None:
        cat_lbls = list(dfr_sub['category'].unique())
    else:
        cat_lbls = orderCat
        
    for i in cat_lbls:
        cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i]
        if fd:
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density']
            f_cat = f_cat.to_frame(name=f'fd_{i}')
        else:            
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency']
            f_cat = f_cat.to_frame(name=f'freq_{i}')
        res = pd.concat([res, f_cat], axis=1)
        
    res = res.reset_index(drop=True)

    if fd:
        col_total_fr = total['frequency density']
    else:
        col_total_fr = total['frequency']

    col_totals = res.iloc[:,2:len(cat_lbls)+2].sum()
    row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1)
    total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())

    # the row percentages
    if percent=='row':
        # add the row total
        col_total_fr = col_total_fr.to_frame(name='TOTAL')
        res = pd.concat([res, col_total_fr], axis=1)
    
        # the row percentages
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names + ['TOTAL']

    # the column percentages
    if percent=='column':
        col_totals = res.sum()
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names  

    # overall percent
    if percent=='both':
        total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
        res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total

    # show totals or not
    if totals=='exclude':
        if percent=='row':
            # remove the created rows total column
            res = res.iloc[:, 0:len(cat_lbls)+2]
    elif totals=='include':
        if percent=='row':
            # only need to add the column totals row
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        elif percent=='row' or percent=='column':
            res['TOTAL'] = row_totals/total*100
            if percent=='column':
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100]
            else:
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        else:
            res['TOTAL'] = row_totals
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total]
            
    return res