Module stikpetP.other.poho_pairwise_t

Expand source code
import pandas as pd
from scipy.stats import t 

def ph_pairwise_t(nomField, scaleField, categories=None):
    '''
    Post-Hoc Pairwise Student T
    ---------------------------
    This function performs pairwise independent samples Student t tests, for use after a one-way ANOVA, to determine which categories significantly differ from each other.
    
    It differs slightly from ph_pairwise_is(nomField, scaleField, isTest="student") in the calculation of the standard error. This version appears to produce the same results as SPSS when a Bonferroni correction is used. SPSS refers to Winer (1962) for its procedures.
    
    A simple Bonferroni correction is also applied.
    
    Parameters
    ----------
    nomField : pandas series
        data with categories
    scaleField : pandas series
        data with the scores
    categories : list or dictionary, optional
        the categories to use from nomField
        
    Returns
    -------
    A data frame with:
    
    * *category 1*, the first category in the pair
    * *category 2*, the second category in the pair
    * *n1*, sample size of first category
    * *n2*, sample size of second category
    * *mean 1*, arithmetic mean of scores in first category
    * *mean 2*, arithmetic mean of scores in second category
    * *sample diff.*, difference between the two arithmetic means
    * *hyp diff.*, the hypothesized difference
    * *statistic*, the test-statistic
    * *df*, the degrees of freedom
    * *p-value*, the unadjusted p-value (significance)
    * *adj. p-value*, the Bonferroni adjusted p-values
    * *test*, description of test used
    
    Notes
    -----
    The formula used:
    $$t_{1,2} = \\frac{\\bar{x}_1 - \\bar{x}_2}{\\sqrt{MS_w \\times\\left(\\frac{1}{n_1}+ \\frac{1}{n_2}\\right)}}$$
    $$df_w = n - k$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_{1,2}\\right|, df_w\\right)\\right)$$
    
    With:
    $$MS_w = \\frac{SS_w}{df_w}$$
    $$SS_w = \\sum_{j=1}^k \\sum_{i=1}^{n_j} \\left(x_{i,j} - \\bar{x}_j\\right)^2$$
    $$\\bar{x}_j = \\frac{\\sum_{i=1}^{n_j} x_{i,j}}{n_j}$$
    
    *Symbols used*
    
    * \\(x_{i,j}\\), the i-th score in category j
    * \\(n\\), the total sample size
    * \\(n_j\\), the number of scores in category j
    * \\(k\\), the number of categories
    * \\(\\bar{x}_j\\), the mean of the scores in category j
    * \\(MS_w\\), the mean square within
    * \\(SS_w\\), the sum of squares within (sum of squared deviations from the category means)
    * \\(df_w\\), the degrees of freedom within
    
    A simple Bonferroni correction is applied for the multiple comparisons. This is simply:
    $$sig._{adj} = \\min \\left(sig. \\times n_{comp}, 1\\right)$$
    
    With:
    $$n_{comp} = \\frac{k\\times\\left(k-1\\right)}{2}$$
    
    Where \\(k\\) is the number of categories.
    
    References
    ----------
    Winer, B. J. (1962). *Statistical principles in experimental design*. McGraw Hill.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    
    if type(nomField) == list:
        nomField = pd.Series(nomField)
        
    if type(scaleField) == list:
        scaleField = pd.Series(scaleField)
        
    data = pd.concat([nomField, scaleField], axis=1)
    data.columns = ["category", "score"]
    
    #remove unused categories
    if categories is not None:
        data = data[data.category.isin(categories)]
    
    #Remove rows with missing values and reset index
    data = data.dropna()
    data = data.reset_index(drop=True)
    
    cats = pd.unique(data["category"])
    
    k = len(cats)
    
    #overall n, mean and ss
    n = len(data["category"])
    m = data.score.mean()
    sst = data.score.var()*(n-1)
    
    #sample sizes and means per category
    nj = data.groupby('category').count()
    mj = data.groupby('category').mean()
    
    ssb = float((nj["score"]*(mj["score"] - m)**2).sum())
    ssw = sst - ssb
    
    dfw = n - k
    msw = ssw/dfw
    
    ncomp = k * (k - 1) / 2
    res = pd.DataFrame()
    resRow=0
    for i in range(0, k-1):
        for j in range(i+1, k):
            res.at[resRow, 0] = cats[i]
            res.at[resRow, 1] = cats[j]
            #look up counts and means by category label, since the groupby index is sorted
            #while cats keeps the order of appearance
            res.at[resRow, 2] = nj.at[cats[i], "score"]
            res.at[resRow, 3] = nj.at[cats[j], "score"]
            res.at[resRow, 4] = mj.at[cats[i], "score"]
            res.at[resRow, 5] = mj.at[cats[j], "score"]
            res.at[resRow, 6] = res.iloc[resRow, 4] - res.iloc[resRow, 5]
            res.at[resRow, 7] = 0
            
            sej = (msw * (1 / res.iloc[resRow, 2] + 1 / res.iloc[resRow, 3]))**0.5
            tVal = res.iloc[resRow, 6]/sej
            res.at[resRow, 8] = tVal
            
            res.at[resRow, 9] = dfw
            pValue = 2*(1-t.cdf(abs(tVal), dfw))
            res.at[resRow, 10] = pValue
            res.at[resRow, 11] = min(pValue*ncomp, 1)
            
            res.at[resRow, 12] = "Winer pairwise t"
            resRow = resRow + 1
    
    res.columns = ["category 1", "category 2", "n1", "n2", "mean 1", "mean 2", "sample diff.", "hyp diff.", "statistic", "df", "p-value", "adj. p-value", "test"]
    return res

Functions

def ph_pairwise_t(nomField, scaleField, categories=None)

Post-Hoc Pairwise Student T

This function performs pairwise independent samples Student t tests, for use after a one-way ANOVA, to determine which categories significantly differ from each other.

It differs slightly from ph_pairwise_is(nomField, scaleField, isTest="student") in the calculation of the standard error. This version appears to produce the same results as SPSS when a Bonferroni correction is used. SPSS refers to Winer (1962) for its procedures.

A simple Bonferroni correction is also applied.

Parameters

nomField : pandas series
data with categories
scaleField : pandas series
data with the scores
categories : list or dictionary, optional
the categories to use from nomField

Returns

A data frame with:
 
  • category 1, the first category in the pair
  • category 2, the second category in the pair
  • n1, sample size of first category
  • n2, sample size of second category
  • mean 1, arithmetic mean of scores in first category
  • mean 2, arithmetic mean of scores in second category
  • sample diff., difference between the two arithmetic means
  • hyp diff., the hypothesized difference
  • statistic, the test-statistic
  • df, the degrees of freedom
  • p-value, the unadjusted p-value (significance)
  • adj. p-value, the Bonferroni adjusted p-values
  • test, description of test used
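
For illustration, a minimal usage sketch. The data frame below is made up; only the module path and the function signature above are taken from this documentation, and the package is assumed to be installed as stikpetP.

import pandas as pd
from stikpetP.other.poho_pairwise_t import ph_pairwise_t

# hypothetical example data: a nominal field with categories and a scale field with scores
df = pd.DataFrame({
    "location": ["north", "north", "north", "south", "south",
                 "south", "east", "east", "east", "east"],
    "score":    [20, 23, 21, 28, 30, 27, 22, 25, 24, 23]})

# all pairwise Student t tests based on MS_w, with Bonferroni adjusted p-values
res = ph_pairwise_t(df["location"], df["score"])
print(res[["category 1", "category 2", "statistic", "df", "p-value", "adj. p-value"]])

# optionally restrict the comparisons to a subset of the categories
res_sub = ph_pairwise_t(df["location"], df["score"], categories=["north", "south"])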

Notes

The formula used:

$$t_{1,2} = \frac{\bar{x}_1 - \bar{x}_2}{\sqrt{MS_w \times\left(\frac{1}{n_1}+ \frac{1}{n_2}\right)}}$$
$$df_w = n - k$$
$$sig. = 2\times\left(1 - T\left(\left|t_{1,2}\right|, df_w\right)\right)$$

With:

$$MS_w = \frac{SS_w}{df_w}$$
$$SS_w = \sum_{j=1}^k \sum_{i=1}^{n_j} \left(x_{i,j} - \bar{x}_j\right)^2$$
$$\bar{x}_j = \frac{\sum_{i=1}^{n_j} x_{i,j}}{n_j}$$

Symbols used

  • \(x_{i,j}\), the i-th score in category j
  • \(n\), the total sample size
  • \(n_j\), the number of scores in category j
  • \(k\), the number of categories
  • \(\bar{x}_j\), the mean of the scores in category j
  • \(MS_w\), the mean square within
  • \(SS_w\), the sum of squares within (sum of squared deviations from the category means)
  • \(df_w\), the degrees of freedom within

A simple Bonferroni correction is applied for the multiple comparisons. This is simply:

$$sig._{adj} = \min \left(sig. \times n_{comp}, 1\right)$$

With:

$$n_{comp} = \frac{k\times\left(k-1\right)}{2}$$

Where \(k\) is the number of categories.
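
As a sketch of these formulas (not part of the module), the snippet below computes MS_w, one pairwise t with its unadjusted and Bonferroni adjusted significance, using made-up scores for three hypothetical categories:

import pandas as pd
from scipy.stats import t

# hypothetical scores for three categories
scores = pd.Series([20, 23, 21, 28, 30, 27, 22, 25, 24, 23])
groups = pd.Series(["a"]*3 + ["b"]*3 + ["c"]*4)

n = len(scores)
k = groups.nunique()

# SS_w: sum of squared deviations from each category mean
ssw = sum(((scores[groups == g] - scores[groups == g].mean())**2).sum()
          for g in groups.unique())
dfw = n - k        # df_w = n - k
msw = ssw / dfw    # MS_w = SS_w / df_w

# pairwise t for categories "a" and "b"
x1, x2 = scores[groups == "a"], scores[groups == "b"]
se = (msw * (1/len(x1) + 1/len(x2)))**0.5
t12 = (x1.mean() - x2.mean()) / se
sig = 2 * (1 - t.cdf(abs(t12), dfw))

# Bonferroni: multiply by the number of comparisons, capped at 1
n_comp = k * (k - 1) / 2
sig_adj = min(sig * n_comp, 1)
print(t12, dfw, sig, sig_adj)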

References

Winer, B. J. (1962). Statistical principles in experimental design. McGraw Hill.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
