Module stikpetP.other.table_cross_bin
Expand source code
import pandas as pd
from ..other.table_frequency_bins import tab_frequency_bins
def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs):
'''
Cross Table with Bins
---------------------
This function creates a cross-table of a categorical field, and bins from a scale field.
Parameters
----------
catField : pandas series or list
data with categories for the columns
scaleField : pandas series or list
data with scores that will be binned
orderCat : list, optional
order for categories
percent : {None, "all", "row", "column"}, optional
which percentages to show. Default is None (will show counts)
totals : {"exclude", "include"}, optional
add margin totals. Default is "exclude"
fd : bool, optional
indication to use frequency density, rater than counts. default is False
kwargs : optional
other arguments passed to tab_frequency_bins() to set the bins
Returns
-------
dataframe : cross table
Notes
-----
The percentage can be set to either
* None, then counts or frequency density will be shown
* "all", then the grand total will be used to determine the percentage
* "row", the row totals will be used to calculate the percentages
* "column", the column totals will be used to calculate the percentages
When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.
Author
------
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
'''
#convert to pandas series if needed
if type(catField) is list:
catField = pd.Series(catField)
if type(scaleField) is list:
scaleField = pd.Series(scaleField)
#combine as one dataframe
dfr_sub = pd.concat([catField, scaleField], axis=1)
dfr_sub = dfr_sub.dropna()
dfr_sub.columns = ['category', 'score']
# get the bins
total = tab_frequency_bins(dfr_sub['score'], **kwargs)
bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))]
res = total[['lower bound', 'upper bound']]
# remove bins from kwargs
kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'}
# determine the counts for each category
if orderCat is None:
cat_lbls = list(dfr_sub['category'].unique())
else:
cat_lbls = orderCat
for i in cat_lbls:
cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i]
if fd:
f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density']
f_cat = f_cat.to_frame(name=f'fd_{i}')
else:
f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency']
f_cat = f_cat.to_frame(name=f'freq_{i}')
res = pd.concat([res, f_cat], axis=1)
res = res.reset_index(drop=True)
if fd:
col_total_fr = total['frequency density']
else:
col_total_fr = total['frequency']
col_totals = res.iloc[:,2:len(cat_lbls)+2].sum()
row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1)
total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
# the row percentages
if percent=='row':
# add the row total
col_total_fr = col_total_fr.to_frame(name='TOTAL')
res = pd.concat([res, col_total_fr], axis=1)
# the row percentages
res_col = list(res.columns)
for i in range(2, len(res_col)):
res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100
# rename the columns
col_names = ['lower bound', 'upper bound']
for i in range(0, len(cat_lbls)):
if fd:
col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
else:
col_names = col_names + [f'perc_{cat_lbls[i]}']
res.columns = col_names + ['TOTAL']
# the column percentages
if percent=='column':
col_totals = res.sum()
res_col = list(res.columns)
for i in range(2, len(res_col)):
res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100
# rename the columns
col_names = ['lower bound', 'upper bound']
for i in range(0, len(cat_lbls)):
if fd:
col_names = col_names + [f'fd_perc_{cat_lbls[i]}']
else:
col_names = col_names + [f'perc_{cat_lbls[i]}']
res.columns = col_names
# overall percent
if percent=='both':
total = sum(res.iloc[:,2:len(cat_lbls)+2].sum())
res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total
# show totals or not
if totals=='exclude':
if percent=='row':
# remove the created rows total column
res = res.iloc[:, 0:len(cat_lbls)+2]
elif totals=='include':
if percent=='row':
# only need to add the column totals row
res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
elif percent=='row' or percent=='column':
res['TOTAL'] = row_totals/total*100
if percent=='column':
res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100]
else:
res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100]
else:
res['TOTAL'] = row_totals
res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total]
return res
Functions
def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False, **kwargs)
-
Cross Table With Bins
This function creates a cross-table of a categorical field, and bins from a scale field.
Parameters
catField
:pandas series
orlist
- data with categories for the columns
scaleField
:pandas series
orlist
- data with scores that will be binned
orderCat
:list
, optional- order for categories
percent
:{None, "all", "row", "column"}
, optional- which percentages to show. Default is None (will show counts)
totals
:{"exclude", "include"}
, optional- add margin totals. Default is "exclude"
fd
:bool
, optional- indication to use frequency density, rater than counts. default is False
kwargs
:optional
- other arguments passed to tab_frequency_bins() to set the bins
Returns
dataframe
:cross table
Notes
The percentage can be set to either
- None, then counts or frequency density will be shown
- "all", then the grand total will be used to determine the percentage
- "row", the row totals will be used to calculate the percentages
- "column", the column totals will be used to calculate the percentages
When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total.
Author
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076Expand source code
def tab_cross_bin(catField, scaleField, orderCat=None, percent=None, totals='exclude', fd=False , **kwargs): ''' Cross Table with Bins --------------------- This function creates a cross-table of a categorical field, and bins from a scale field. Parameters ---------- catField : pandas series or list data with categories for the columns scaleField : pandas series or list data with scores that will be binned orderCat : list, optional order for categories percent : {None, "all", "row", "column"}, optional which percentages to show. Default is None (will show counts) totals : {"exclude", "include"}, optional add margin totals. Default is "exclude" fd : bool, optional indication to use frequency density, rater than counts. default is False kwargs : optional other arguments passed to tab_frequency_bins() to set the bins Returns ------- dataframe : cross table Notes ----- The percentage can be set to either * None, then counts or frequency density will be shown * "all", then the grand total will be used to determine the percentage * "row", the row totals will be used to calculate the percentages * "column", the column totals will be used to calculate the percentages When including totals, the row or column percentage will also be calculated based on the total count/freq. dens. of the one that is not used for the total. Author ------ Made by P. Stikker Companion website: https://PeterStatistics.com YouTube channel: https://www.youtube.com/stikpet Donations: https://www.patreon.com/bePatron?u=19398076 ''' #convert to pandas series if needed if type(catField) is list: catField = pd.Series(catField) if type(scaleField) is list: scaleField = pd.Series(scaleField) #combine as one dataframe dfr_sub = pd.concat([catField, scaleField], axis=1) dfr_sub = dfr_sub.dropna() dfr_sub.columns = ['category', 'score'] # get the bins total = tab_frequency_bins(dfr_sub['score'], **kwargs) bins_tot = [(total.iloc[i, 0], total.iloc[i,1]) for i in range(len(total))] res = total[['lower bound', 'upper bound']] # remove bins from kwargs kwargs_no_bins = {k: v for k, v in kwargs.items() if k != 'bins'} # determine the counts for each category if orderCat is None: cat_lbls = list(dfr_sub['category'].unique()) else: cat_lbls = orderCat for i in cat_lbls: cat_scores = dfr_sub.iloc[:,1][dfr_sub.iloc[:,0] == i] if fd: f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency density'] f_cat = f_cat.to_frame(name=f'fd_{i}') else: f_cat = tab_frequency_bins(cat_scores, bins=bins_tot, **kwargs_no_bins)['frequency'] f_cat = f_cat.to_frame(name=f'freq_{i}') res = pd.concat([res, f_cat], axis=1) res = res.reset_index(drop=True) if fd: col_total_fr = total['frequency density'] else: col_total_fr = total['frequency'] col_totals = res.iloc[:,2:len(cat_lbls)+2].sum() row_totals = res.iloc[:,2:len(cat_lbls)+2].sum(axis=1) total = sum(res.iloc[:,2:len(cat_lbls)+2].sum()) # the row percentages if percent=='row': # add the row total col_total_fr = col_total_fr.to_frame(name='TOTAL') res = pd.concat([res, col_total_fr], axis=1) # the row percentages res_col = list(res.columns) for i in range(2, len(res_col)): res[res_col[i]] = res[res_col[i]]/res['TOTAL']*100 # rename the columns col_names = ['lower bound', 'upper bound'] for i in range(0, len(cat_lbls)): if fd: col_names = col_names + [f'fd_perc_{cat_lbls[i]}'] else: col_names = col_names + [f'perc_{cat_lbls[i]}'] res.columns = col_names + ['TOTAL'] # the column percentages if percent=='column': col_totals = res.sum() res_col = list(res.columns) for i in range(2, len(res_col)): res[res_col[i]] = res[res_col[i]]/col_totals.iloc[i]*100 # rename the columns col_names = ['lower bound', 'upper bound'] for i in range(0, len(cat_lbls)): if fd: col_names = col_names + [f'fd_perc_{cat_lbls[i]}'] else: col_names = col_names + [f'perc_{cat_lbls[i]}'] res.columns = col_names # overall percent if percent=='both': total = sum(res.iloc[:,2:len(cat_lbls)+2].sum()) res.iloc[:,2:len(cat_lbls)+2] = res.iloc[:,2:len(cat_lbls)+2]/total # show totals or not if totals=='exclude': if percent=='row': # remove the created rows total column res = res.iloc[:, 0:len(cat_lbls)+2] elif totals=='include': if percent=='row': # only need to add the column totals row res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100] elif percent=='row' or percent=='column': res['TOTAL'] = row_totals/total*100 if percent=='column': res.loc[len(res)] = ['TOTAL', 'n.a.'] + [100]*len(cat_lbls) + [100] else: res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals/total*100) + [100] else: res['TOTAL'] = row_totals res.loc[len(res)] = ['TOTAL', 'n.a.'] + list(col_totals) + [total] return res