Module stikpetP.other.poho_pairwise_ass
Expand source code
import pandas as pd
from ..tests.test_freeman_tukey_ind import ts_freeman_tukey_ind
from ..tests.test_pearson_ind import ts_pearson_ind
from ..tests.test_g_ind import ts_g_ind
from ..tests.test_neyman_ind import ts_neyman_ind
from ..tests.test_mod_log_likelihood_ind import ts_mod_log_likelihood_ind
from ..tests.test_powerdivergence_ind import ts_powerdivergence_ind
from ..tests.test_fisher import ts_fisher
from ..other.table_cross import tab_cross
def ph_pairwise_ass(field1, field2, categories1=None, categories2=None, test="pearson", collapse=None, **kwargs):
    '''
    Post-Hoc Pairwise Nominal Association Tests
    -------------------------------------------
    This post-hoc test reduces a contingency table to a series of 2x2 sub-tables and runs the chosen test of independence on each of them: Pearson chi-square, G, Mod-Log Likelihood, Power Divergence, Freeman-Tukey, Neyman, or Fisher.

    Parameters
    ----------
    field1 : pandas series or list
        data of the first field
    field2 : pandas series or list
        data of the second field
    categories1 : list or dictionary, optional
        the categories to use from field1
    categories2 : list or dictionary, optional
        the categories to use from field2
    test : {"pearson", "g", "freeman-tukey", "neyman", "mod-log", "pd", "fisher"}, optional
        the test to use
    collapse : {None, "both", "field1", "field2"} : string, optional
        when selecting a row or column compare to all other rows/columns, or all other individual rows/columns
    **kwargs : various, optional
        other arguments to pass on for the specific test used (not passed on to the Fisher test).

    Returns
    -------
    A dataframe with one row per 2x2 test: four columns labelling the categories involved, the columns returned by the chosen test (these must include 'p-value'), and 'adj. p-value' with the Bonferroni adjusted p-value.

    Raises
    ------
    ValueError
        if *test* is not one of the listed options, or if no 2x2 sub-table can be formed from the data.

    Notes
    -----
    With the *collapse* parameter it is possible to choose how to create the 2x2 tables from a nxk table

    * *None* (or any other value), will choose every possible combination of 2 rows and 2 columns
    * *field1*, will choose every possible pair of field2 categories and compare with one category from field1 and combine all other categories from field1.
    * *field2*, will choose every possible pair of field1 categories and compare with one category from field2 and combine all other categories from field2.
    * *both*, will take each category from field1 and each category from field2 and combine all other categories in each field.

    Combined categories are labelled 'other' in the data that is passed to the test.

    The tests that can be used are:

    * *pearson*, performs a Pearson chi-square test of independence (see ts_pearson_ind())
    * *g*, performs a G (Likelihood Ratio / Wilks) test of independence (see ts_g_ind())
    * *freeman-tukey*, performs a Freeman-Tukey test of independence (see ts_freeman_tukey_ind())
    * *neyman*, performs a Neyman test of independence (see ts_neyman_ind())
    * *mod-log*, performs a Mod-Log Likelihood test of independence (see ts_mod_log_likelihood_ind())
    * *pd*, performs a Power Divergence test of independence (see ts_powerdivergence_ind())
    * *fisher*, performs a Fisher Exact test of independence (see ts_fisher())

    The Bonferroni adjustment is simply:
    $$p_{adj} = \\min \\left(p \\times n_{comp}, 1\\right)$$

    *Symbols used:*

    * \\(n_{comp}\\), number of comparisons, i.e. the number of 2x2 tests performed (rows in the result)

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076
    '''
    from itertools import combinations

    # fail early on a misspelled test name instead of hitting an unbound result later on
    valid_tests = ("pearson", "g", "freeman-tukey", "neyman", "mod-log", "pd", "fisher")
    if test not in valid_tests:
        raise ValueError("unknown test {!r}; expected one of {}".format(test, valid_tests))

    if isinstance(field1, list):
        field1 = pd.Series(field1)
    if isinstance(field2, list):
        field2 = pd.Series(field2)

    def run_test(data1, data2, **cats):
        # run the selected test on one 2x2 selection, always returning a dataframe
        if test == "fisher":
            # ts_fisher's result is used as the p-value itself and it does not receive **kwargs
            return pd.DataFrame({'p-value': [ts_fisher(data1, data2, **cats)]})
        test_func = {
            "pearson": ts_pearson_ind,
            "g": ts_g_ind,
            "freeman-tukey": ts_freeman_tukey_ind,
            "neyman": ts_neyman_ind,
            "mod-log": ts_mod_log_likelihood_ind,
            "pd": ts_powerdivergence_ind,
        }[test]
        return test_func(data1, data2, **cats, **kwargs)

    def collapse_to(series, keep, name):
        # every value other than `keep` is replaced with the label 'other'
        return series.apply(lambda x: 'other' if x != keep else x).rename(name)

    rows = []

    def add_row(labels, tst_res):
        # the four category labels (columns 0-3) followed by the test's own output columns
        head = pd.DataFrame({pos: [label] for pos, label in enumerate(labels)})
        rows.append(pd.concat([head, tst_res.reset_index(drop=True)], axis=1))

    if collapse in ("both", "field1", "field2"):
        # combine as one dataframe so incomplete pairs can be dropped together
        df = pd.concat([field1, field2], axis=1).dropna()

        # only use given categories
        if categories1 is not None:
            df = df[df.iloc[:, 0].isin(categories1)]
        if categories2 is not None:
            df = df[df.iloc[:, 1].isin(categories2)]

        data1 = df.iloc[:, 0]
        data2 = df.iloc[:, 1]

        # unique categories in order of first appearance, so the row order of the result is reproducible
        cats1 = list(dict.fromkeys(data1))
        cats2 = list(dict.fromkeys(data2))

        for i, cat1 in enumerate(cats1):
            # the collapsed version of field1 only depends on cat1, so build it once per category
            coll1 = collapse_to(data1, cat1, 'cat1 collapsed') if collapse != "field2" else None
            for j, cat2 in enumerate(cats2):
                if collapse == "field1":
                    for cat2_b in cats2[j + 1:]:
                        tst_res = run_test(coll1, data2, categories2=[cat2, cat2_b])
                        add_row((cat1, 'not ' + str(cat1), cat2, cat2_b), tst_res)
                elif collapse == "field2":
                    coll2 = collapse_to(data2, cat2, 'cat2 collapsed')
                    for cat1_b in cats1[i + 1:]:
                        tst_res = run_test(data1, coll2, categories1=[cat1, cat1_b])
                        add_row((cat1, cat1_b, cat2, 'not ' + str(cat2)), tst_res)
                else:
                    coll2 = collapse_to(data2, cat2, 'cat2 collapsed')
                    tst_res = run_test(coll1, coll2)
                    add_row((cat1, 'not ' + str(cat1), cat2, 'not ' + str(cat2)), tst_res)
    else:
        # create the cross table, only used to find the row and column categories
        ct = tab_cross(field1, field2, order1=categories1, order2=categories2, totals="exclude")
        for row_a, row_b in combinations(ct.index, 2):
            for col_a, col_b in combinations(ct.columns, 2):
                tst_res = run_test(field1, field2, categories1=[row_a, row_b], categories2=[col_a, col_b])
                add_row((row_a, row_b, col_a, col_b), tst_res)

    if not rows:
        raise ValueError("no 2x2 sub-tables could be formed from the given fields and categories")

    # a single concat instead of growing the result one row at a time
    res = pd.concat(rows, ignore_index=True)

    # the label 'field2 cat 2' (without the period) is kept as is, for callers selecting it by name
    res.columns = ['field1 cat. 1', 'field1 cat. 2', 'field2 cat. 1', 'field2 cat 2'] + res.columns[4:].tolist()

    # Bonferroni correction, capped at 1
    res['adj. p-value'] = (res['p-value'] * len(res)).clip(upper=1)

    return res
Functions
def ph_pairwise_ass(field1, field2, categories1=None, categories2=None, test='pearson', collapse=None, **kwargs)-
Post-Hoc Pairwise Nominal Association Tests
This post-hoc test collapses a contingency table to all possible 2x2 sub-tables. It can then perform any of the nominal vs nominal or binary vs binary tests: Pearson chi-square, G, Mod-Log Likelihood, Power Divergence, Freeman-Tukey, Neyman, or Fisher.
Parameters
field1:pandas series- data of the first field
field2:pandas series- data of the second field
categories1: list or dictionary, optional - the categories to use from field1
categories2: list or dictionary, optional - the categories to use from field2
test:{"pearson", "g", "freeman-tukey", "neyman", "mod-log", "pd", "fisher"}, optional- the test to use
collapse:{None, "both", "field1", "field2"} : string, optional- when selecting a row or column compare to all other rows/columns, or all other individual rows/columns
**kwargs:various, optional- other arguments to pass on for the specific test used.
Returns
A dataframe with all the test results and an adjusted p-value using a Bonferroni correction.
Notes
With the collapse parameter it is possible to choose how to create the 2x2 tables from a nxk table
- None, will choose every possible combination of 2 rows and 2 columns
- field1, will choose every possible pair of field2 categories and compare with one category from field1 and combine all other categories from field1.
- field2, will choose every possible pair of field1 categories and compare with one category from field2 and combine all other categories from field2.
The tests that can be used are:
- pearson, performs a Pearson chi-square test of independence (see ts_pearson_ind())
- g, performs a G (Likelihood Ratio / Wilks) test of independence (see ts_g_ind())
- freeman-tukey, performs a Freeman-Tukey test of independence (see ts_freeman_tukey_ind())
- neyman, performs a Neyman test of independence (see ts_neyman_ind())
- mod-log, performs a Mod-Log Likelihood test of independence (see ts_mod_log_likelihood_ind())
- pd, performs a Power Divergence test of independence (see ts_powerdivergence_ind())
- fisher, performs a Fisher Exact test of independence (see ts_fisher())
The Bonferroni adjustment is simply:
$$p_{adj} = \min \left(p \times n_{comp}, 1\right)$$
$$n_{comp} = \frac{k\times\left(k-1\right)}{2}$$
Symbols used:
- \(n_{comp}\), number of comparisons (pairs)
- \(k\), number of categories
Author
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
Expand source code
def ph_pairwise_ass(field1, field2, categories1=None, categories2=None, test="pearson", collapse=None, **kwargs):
    '''
    Post-Hoc Pairwise Nominal Association Tests
    -------------------------------------------
    This post-hoc test reduces a contingency table to a series of 2x2 sub-tables and runs the chosen test of independence on each of them: Pearson chi-square, G, Mod-Log Likelihood, Power Divergence, Freeman-Tukey, Neyman, or Fisher.

    Parameters
    ----------
    field1 : pandas series or list
        data of the first field
    field2 : pandas series or list
        data of the second field
    categories1 : list or dictionary, optional
        the categories to use from field1
    categories2 : list or dictionary, optional
        the categories to use from field2
    test : {"pearson", "g", "freeman-tukey", "neyman", "mod-log", "pd", "fisher"}, optional
        the test to use
    collapse : {None, "both", "field1", "field2"} : string, optional
        when selecting a row or column compare to all other rows/columns, or all other individual rows/columns
    **kwargs : various, optional
        other arguments to pass on for the specific test used (not passed on to the Fisher test).

    Returns
    -------
    A dataframe with one row per 2x2 test: four columns labelling the categories involved, the columns returned by the chosen test (these must include 'p-value'), and 'adj. p-value' with the Bonferroni adjusted p-value.

    Raises
    ------
    ValueError
        if *test* is not one of the listed options, or if no 2x2 sub-table can be formed from the data.

    Notes
    -----
    With the *collapse* parameter it is possible to choose how to create the 2x2 tables from a nxk table

    * *None* (or any other value), will choose every possible combination of 2 rows and 2 columns
    * *field1*, will choose every possible pair of field2 categories and compare with one category from field1 and combine all other categories from field1.
    * *field2*, will choose every possible pair of field1 categories and compare with one category from field2 and combine all other categories from field2.
    * *both*, will take each category from field1 and each category from field2 and combine all other categories in each field.

    Combined categories are labelled 'other' in the data that is passed to the test.

    The tests that can be used are:

    * *pearson*, performs a Pearson chi-square test of independence (see ts_pearson_ind())
    * *g*, performs a G (Likelihood Ratio / Wilks) test of independence (see ts_g_ind())
    * *freeman-tukey*, performs a Freeman-Tukey test of independence (see ts_freeman_tukey_ind())
    * *neyman*, performs a Neyman test of independence (see ts_neyman_ind())
    * *mod-log*, performs a Mod-Log Likelihood test of independence (see ts_mod_log_likelihood_ind())
    * *pd*, performs a Power Divergence test of independence (see ts_powerdivergence_ind())
    * *fisher*, performs a Fisher Exact test of independence (see ts_fisher())

    The Bonferroni adjustment is simply:
    $$p_{adj} = \\min \\left(p \\times n_{comp}, 1\\right)$$

    *Symbols used:*

    * \\(n_{comp}\\), number of comparisons, i.e. the number of 2x2 tests performed (rows in the result)

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076
    '''
    from itertools import combinations

    # fail early on a misspelled test name instead of hitting an unbound result later on
    valid_tests = ("pearson", "g", "freeman-tukey", "neyman", "mod-log", "pd", "fisher")
    if test not in valid_tests:
        raise ValueError("unknown test {!r}; expected one of {}".format(test, valid_tests))

    if isinstance(field1, list):
        field1 = pd.Series(field1)
    if isinstance(field2, list):
        field2 = pd.Series(field2)

    def run_test(data1, data2, **cats):
        # run the selected test on one 2x2 selection, always returning a dataframe
        if test == "fisher":
            # ts_fisher's result is used as the p-value itself and it does not receive **kwargs
            return pd.DataFrame({'p-value': [ts_fisher(data1, data2, **cats)]})
        test_func = {
            "pearson": ts_pearson_ind,
            "g": ts_g_ind,
            "freeman-tukey": ts_freeman_tukey_ind,
            "neyman": ts_neyman_ind,
            "mod-log": ts_mod_log_likelihood_ind,
            "pd": ts_powerdivergence_ind,
        }[test]
        return test_func(data1, data2, **cats, **kwargs)

    def collapse_to(series, keep, name):
        # every value other than `keep` is replaced with the label 'other'
        return series.apply(lambda x: 'other' if x != keep else x).rename(name)

    rows = []

    def add_row(labels, tst_res):
        # the four category labels (columns 0-3) followed by the test's own output columns
        head = pd.DataFrame({pos: [label] for pos, label in enumerate(labels)})
        rows.append(pd.concat([head, tst_res.reset_index(drop=True)], axis=1))

    if collapse in ("both", "field1", "field2"):
        # combine as one dataframe so incomplete pairs can be dropped together
        df = pd.concat([field1, field2], axis=1).dropna()

        # only use given categories
        if categories1 is not None:
            df = df[df.iloc[:, 0].isin(categories1)]
        if categories2 is not None:
            df = df[df.iloc[:, 1].isin(categories2)]

        data1 = df.iloc[:, 0]
        data2 = df.iloc[:, 1]

        # unique categories in order of first appearance, so the row order of the result is reproducible
        cats1 = list(dict.fromkeys(data1))
        cats2 = list(dict.fromkeys(data2))

        for i, cat1 in enumerate(cats1):
            # the collapsed version of field1 only depends on cat1, so build it once per category
            coll1 = collapse_to(data1, cat1, 'cat1 collapsed') if collapse != "field2" else None
            for j, cat2 in enumerate(cats2):
                if collapse == "field1":
                    for cat2_b in cats2[j + 1:]:
                        tst_res = run_test(coll1, data2, categories2=[cat2, cat2_b])
                        add_row((cat1, 'not ' + str(cat1), cat2, cat2_b), tst_res)
                elif collapse == "field2":
                    coll2 = collapse_to(data2, cat2, 'cat2 collapsed')
                    for cat1_b in cats1[i + 1:]:
                        tst_res = run_test(data1, coll2, categories1=[cat1, cat1_b])
                        add_row((cat1, cat1_b, cat2, 'not ' + str(cat2)), tst_res)
                else:
                    coll2 = collapse_to(data2, cat2, 'cat2 collapsed')
                    tst_res = run_test(coll1, coll2)
                    add_row((cat1, 'not ' + str(cat1), cat2, 'not ' + str(cat2)), tst_res)
    else:
        # create the cross table, only used to find the row and column categories
        ct = tab_cross(field1, field2, order1=categories1, order2=categories2, totals="exclude")
        for row_a, row_b in combinations(ct.index, 2):
            for col_a, col_b in combinations(ct.columns, 2):
                tst_res = run_test(field1, field2, categories1=[row_a, row_b], categories2=[col_a, col_b])
                add_row((row_a, row_b, col_a, col_b), tst_res)

    if not rows:
        raise ValueError("no 2x2 sub-tables could be formed from the given fields and categories")

    # a single concat instead of growing the result one row at a time
    res = pd.concat(rows, ignore_index=True)

    # the label 'field2 cat 2' (without the period) is kept as is, for callers selecting it by name
    res.columns = ['field1 cat. 1', 'field1 cat. 2', 'field2 cat. 1', 'field2 cat 2'] + res.columns[4:].tolist()

    # Bonferroni correction, capped at 1
    res['adj. p-value'] = (res['p-value'] * len(res)).clip(upper=1)

    return res