Module stikpetP.other.poho_pairwise_t
import pandas as pd
from scipy.stats import t
def ph_pairwise_t(nomField, scaleField, categories=None):
    '''
    Post-Hoc Pairwise Student T
    ---------------------------
    This function performs pairwise independent samples Student t tests, for use after a one-way ANOVA, to determine which categories significantly differ from each other.
    It differs slightly from ph_pairwise_is(nomField, scaleField, isTest = "student") in how the standard error is calculated. This version appears to produce the same results as SPSS when a Bonferroni correction is used. SPSS refers to Winer (1962) for its procedures.
    A simple Bonferroni correction is also applied.

    Parameters
    ----------
    nomField : pandas series
        data with the categories
    scaleField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories of nomField to use

    Returns
    -------
    A data frame with:

    * *category 1*, the first category in the pair
    * *category 2*, the second category in the pair
    * *n1*, sample size of first category
    * *n2*, sample size of second category
    * *mean 1*, arithmetic mean of scores in first category
    * *mean 2*, arithmetic mean of scores in second category
    * *sample diff.*, difference between the two arithmetic means
    * *hyp diff.*, the hypothesized difference
    * *statistic*, the test statistic
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-value
    * *test*, description of the test used

    Notes
    -----
    The formulas used:
    $$t_{1,2} = \\frac{\\bar{x}_1 - \\bar{x}_2}{\\sqrt{MS_w \\times\\left(\\frac{1}{n_1}+ \\frac{1}{n_2}\\right)}}$$
    $$df_w = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df_w\\right)\\right)$$
    With:
    $$MS_w = \\frac{SS_w}{df_w}$$
    $$SS_w = \\sum_{j=1}^k \\sum_{i=1}^{n_j} \\left(x_{i,j} - \\bar{x}_j\\right)^2$$
    $$\\bar{x}_j = \\frac{\\sum_{i=1}^{n_j} x_{i,j}}{n_j}$$

    *Symbols used*

    * \\(x_{i,j}\\), the i-th score in category j
    * \\(n\\), the total sample size
    * \\(n_j\\), the number of scores in category j
    * \\(k\\), the number of categories
    * \\(\\bar{x}_j\\), the mean of the scores in category j
    * \\(MS_w\\), the mean square within
    * \\(SS_w\\), the sum of squares within (sum of squared deviations from the category means)
    * \\(df_w\\), the degrees of freedom within

    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times n_{comp}, 1\\right)$$
    With:
    $$n_{comp} = \\frac{k\\times\\left(k-1\\right)}{2}$$
    Where \\(k\\) is the number of categories.

    References
    ----------
    Winer, B. J. (1962). *Statistical principles in experimental design*. McGraw Hill.

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com

    YouTube channel: https://www.youtube.com/stikpet

    Donations: https://www.patreon.com/bePatron?u=19398076
    '''
    if type(nomField) == list:
        nomField = pd.Series(nomField)
    if type(scaleField) == list:
        scaleField = pd.Series(scaleField)
    data = pd.concat([nomField, scaleField], axis=1)
    data.columns = ["category", "score"]
    # remove unused categories
    if categories is not None:
        data = data[data.category.isin(categories)]
    # remove rows with missing values and reset the index
    data = data.dropna()
    data = data.reset_index(drop=True)
    cats = pd.unique(data["category"])
    k = len(cats)
    # number of pairwise comparisons, used for the Bonferroni correction
    ncomp = k * (k - 1) / 2
    # overall n, mean and total sum of squares
    n = len(data["category"])
    m = data.score.mean()
    sst = data.score.var() * (n - 1)
    # sample sizes and means per category
    nj = data.groupby('category').count()
    mj = data.groupby('category').mean()
    # within sum of squares, degrees of freedom, and mean square within
    ssb = float((nj * (mj - m)**2).sum())
    ssw = sst - ssb
    dfw = n - k
    msw = ssw / dfw
    res = pd.DataFrame()
    resRow = 0
    for i in range(0, k - 1):
        for j in range(i + 1, k):
            res.at[resRow, 0] = cats[i]
            res.at[resRow, 1] = cats[j]
            # look up by label: groupby sorts the categories, while
            # pd.unique keeps them in order of appearance
            res.at[resRow, 2] = nj.loc[cats[i], "score"]
            res.at[resRow, 3] = nj.loc[cats[j], "score"]
            res.at[resRow, 4] = mj.loc[cats[i], "score"]
            res.at[resRow, 5] = mj.loc[cats[j], "score"]
            res.at[resRow, 6] = res.iloc[resRow, 4] - res.iloc[resRow, 5]
            res.at[resRow, 7] = 0
            # standard error based on the pooled mean square within
            sej = (msw * (1 / res.iloc[resRow, 2] + 1 / res.iloc[resRow, 3]))**0.5
            tVal = res.iloc[resRow, 6] / sej
            res.at[resRow, 8] = tVal
            res.at[resRow, 9] = dfw
            pValue = 2 * (1 - t.cdf(abs(tVal), dfw))
            res.at[resRow, 10] = pValue
            # Bonferroni adjustment, capped at 1
            res.at[resRow, 11] = min(pValue * ncomp, 1)
            res.at[resRow, 12] = "Winer pairwise t"
            resRow = resRow + 1
    res.columns = ["category 1", "category 2", "n1", "n2", "mean 1", "mean 2", "sample diff.", "hyp diff.", "statistic", "df", "p-value", "adj. p-value", "test"]
    return res
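As a minimal sketch of the formulas in the Notes section, the pooled-variance pairwise t for a single pair can be computed directly. The data below are hypothetical toy values, not from the source; the steps mirror the docstring: within sum of squares, \(df_w = n - k\), \(MS_w\), the pooled standard error, and the Bonferroni adjustment.

```python
import pandas as pd
from scipy.stats import t

# hypothetical toy data: three categories with four scores each
data = pd.DataFrame({
    "category": ["a"]*4 + ["b"]*4 + ["c"]*4,
    "score":    [1, 2, 3, 4,  3, 4, 5, 6,  5, 6, 7, 8]
})
n = len(data)
k = data["category"].nunique()

# SS_w: sum of squared deviations from each category mean
ssw = sum(((g - g.mean())**2).sum()
          for _, g in data.groupby("category")["score"])
dfw = n - k        # degrees of freedom within
msw = ssw / dfw    # mean square within

# pairwise t for categories "a" and "b"
ga = data.loc[data.category == "a", "score"]
gb = data.loc[data.category == "b", "score"]
se = (msw * (1/len(ga) + 1/len(gb)))**0.5
tval = (ga.mean() - gb.mean()) / se
p = 2 * (1 - t.cdf(abs(tval), dfw))
# Bonferroni: multiply by the number of pairs, cap at 1
p_adj = min(p * k*(k-1)/2, 1)
```

Because the standard error pools the variance over all \(k\) groups (via \(MS_w\) with \(n-k\) degrees of freedom), the result differs from an ordinary two-sample t test that pools only the two groups in the pair.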