Module `stikpetP.other.table_cross_bin`

Expand source code

import pandas as pd
from ..other.table_frequency_bins import tab_frequency_bins

def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs):
    '''
    Cross Table with Bins
    ---------------------

    This function creates a cross-table of a categorical field, and bins from a scale field.

    Parameters
    ----------
    catField : pandas series or list
        data with categories for the columns
    scaleField : pandas series or list
        data with scores that will be binned
    orderCat : list, optional
        order for categories
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts)
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    fd : bool, optional
        indication to use frequency density, rater than counts. default is False
    kwargs : optional
        other arguments passed to tab_frequency_bins() to set the bins

    Returns
    -------
    dataframe : cross table

    Notes
    -----
    The percentage can be set to either 
    
    * None, then counts or frequency density will be shown
    * "all", then the grand total will be used to determine the percentage
    * "row", the row totals will be used to calculate the percentages
    * "column", the column totals will be used to calculate the percentages

    When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    
    '''
    #convert to pandas series if needed
    if type(catField) is list:
        catField = pd.Series(catField)
    
    if type(scaleField) is list:
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe
    dfr_sub = pd.concat([catField, scaleField], axis=1)
    dfr_sub = dfr_sub.dropna()
    dfr_sub.columns = ['category', 'score']

    # get the bins
    total = tab_frequency_bins(dfr_sub['score'], **kwargs)
    bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))]
    res = total[['lower bound', 'upper bound']]

    # remove bins from kwargs
    kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'}

    # determine the counts for each category
    if orderCat is None:
        cat_lbls = list(dfr_sub['category'].unique())
    else:
        cat_lbls = orderCat
        
    for i in cat_lbls:
        cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i]
        if fd:
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density']
            f_cat = f_cat.to_frame(name=f'fd_{i}')
        else:            
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency']
            f_cat = f_cat.to_frame(name=f'freq_{i}')
        res = pd.concat([res, f_cat], axis=1)
        
    res = res.reset_index(drop=True)

    if fd:
        col_total_fr = total['frequency density']
    else:
        col_total_fr = total['frequency']

    col_totals = res.iloc[:,2:len(cat_lbls)+2].sum()
    row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1)
    total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())

    # the row percentages
    if percent=='row':
        # add the row total
        col_total_fr = col_total_fr.to_frame(name='TOTAL')
        res = pd.concat([res, col_total_fr], axis=1)
    
        # the row percentages
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names + ['TOTAL']

    # the column percentages
    if percent=='column':
        col_totals = res.sum()
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names  

    # overall percent
    if percent=='both':
        total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
        res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total

    # show totals or not
    if totals=='exclude':
        if percent=='row':
            # remove the created rows total column
            res = res.iloc[:, 0:len(cat_lbls)+2]
    elif totals=='include':
        if percent=='row':
            # only need to add the column totals row
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        elif percent=='row' or percent=='column':
            res['TOTAL'] = row_totals/total*100
            if percent=='column':
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100]
            else:
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        else:
            res['TOTAL'] = row_totals
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total]
            
    return res

Functions

def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False, **kwargs)

Cross Table With Bins

This function creates a cross-table of a categorical field, and bins from a scale field.

Parameters

catField : pandas series or list: data with categories for the columns
scaleField : pandas series or list: data with scores that will be binned
orderCat : list, optional: order for categories
percent : {None, "all", "row", "column"}, optional: which percentages to show. Default is None (will show counts)
totals : {"exclude", "include"}, optional: add margin totals. Default is "exclude"
fd : bool, optional: indication to use frequency density, rater than counts. default is False
kwargs : optional: other arguments passed to tab_frequency_bins() to set the bins

Returns

dataframe : cross table

Notes

The percentage can be set to either

None, then counts or frequency density will be shown
"all", then the grand total will be used to determine the percentage
"row", the row totals will be used to calculate the percentages
"column", the column totals will be used to calculate the percentages

When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code

def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs):
    '''
    Cross Table with Bins
    ---------------------

    This function creates a cross-table of a categorical field, and bins from a scale field.

    Parameters
    ----------
    catField : pandas series or list
        data with categories for the columns
    scaleField : pandas series or list
        data with scores that will be binned
    orderCat : list, optional
        order for categories
    percent : {None, "all", "row", "column"}, optional
        which percentages to show. Default is None (will show counts)
    totals : {"exclude", "include"}, optional
        add margin totals. Default is "exclude"
    fd : bool, optional
        indication to use frequency density, rater than counts. default is False
    kwargs : optional
        other arguments passed to tab_frequency_bins() to set the bins

    Returns
    -------
    dataframe : cross table

    Notes
    -----
    The percentage can be set to either 
    
    * None, then counts or frequency density will be shown
    * "all", then the grand total will be used to determine the percentage
    * "row", the row totals will be used to calculate the percentages
    * "column", the column totals will be used to calculate the percentages

    When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    
    '''
    #convert to pandas series if needed
    if type(catField) is list:
        catField = pd.Series(catField)
    
    if type(scaleField) is list:
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe
    dfr_sub = pd.concat([catField, scaleField], axis=1)
    dfr_sub = dfr_sub.dropna()
    dfr_sub.columns = ['category', 'score']

    # get the bins
    total = tab_frequency_bins(dfr_sub['score'], **kwargs)
    bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))]
    res = total[['lower bound', 'upper bound']]

    # remove bins from kwargs
    kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'}

    # determine the counts for each category
    if orderCat is None:
        cat_lbls = list(dfr_sub['category'].unique())
    else:
        cat_lbls = orderCat
        
    for i in cat_lbls:
        cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i]
        if fd:
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density']
            f_cat = f_cat.to_frame(name=f'fd_{i}')
        else:            
            f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency']
            f_cat = f_cat.to_frame(name=f'freq_{i}')
        res = pd.concat([res, f_cat], axis=1)
        
    res = res.reset_index(drop=True)

    if fd:
        col_total_fr = total['frequency density']
    else:
        col_total_fr = total['frequency']

    col_totals = res.iloc[:,2:len(cat_lbls)+2].sum()
    row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1)
    total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())

    # the row percentages
    if percent=='row':
        # add the row total
        col_total_fr = col_total_fr.to_frame(name='TOTAL')
        res = pd.concat([res, col_total_fr], axis=1)
    
        # the row percentages
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names + ['TOTAL']

    # the column percentages
    if percent=='column':
        col_totals = res.sum()
        res_col = list(res.columns)
        for i in range(2, len(res_col)):
            res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100
        # rename the columns
        col_names = ['lower bound', 'upper bound']
        for i in range(0, len(cat_lbls)):
            if fd:
                col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
            else:
                col_names = col_names + [f'perc_{cat_lbls[i]}']
        res.columns = col_names  

    # overall percent
    if percent=='both':
        total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
        res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total

    # show totals or not
    if totals=='exclude':
        if percent=='row':
            # remove the created rows total column
            res = res.iloc[:, 0:len(cat_lbls)+2]
    elif totals=='include':
        if percent=='row':
            # only need to add the column totals row
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        elif percent=='row' or percent=='column':
            res['TOTAL'] = row_totals/total*100
            if percent=='column':
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100]
            else:
                res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
        else:
            res['TOTAL'] = row_totals
            res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total]
            
    return res