Module stikpetP.correlations.cor_pearson

Expand source code
import pandas as pd
from math import atanh
from statistics import NormalDist
from scipy.stats import t 

def r_pearson(field1, field2, corr=None, test = "t"):
    '''
    Pearson Product-Moment Correlation Coefficient
    ----------------------------------------------
    A measure of linear correlation. A value of -1 indicates a perfect negative linear correlation (a straight line going down: if the score in one field goes up, the other goes down), 0 indicates no linear correlation, and +1 a perfect positive linear correlation (a straight line going up: if the score in one field goes up, so does the other).
    
    Various tests can be used to determine if the coefficient is significantly different from zero. See notes for details.
    
    Parameters
    ----------
    field1 : pandas series
        the numeric scores of the first variable
    field2 : pandas series
        the numeric scores of the second variable
    corr : {None, "fisher", "olkin-pratt"}, optional
        correction to be applied
    test : {"t", "z"}, optional
        which test to perform. Default is "t"
    
    Returns
    -------
    res : dataframe, with
    
    * "r", the coefficient value
    * "statistic", the test statistic (t or z-value)
    * "df", degrees of freedom (only applicable for t-test)
    * "p-value", significance (p-value)
    
    Notes
    -----
    The formula used for the correlation coefficient (Pearson, 1896, p. 265):
    $$r = \\frac{\\sum_{i=1}^n \\left(x_i - \\bar{x}\\right)\\left(y_i - \\bar{y}\\right)}{n\\times \\sigma_x\\times\\sigma_y}$$
    
    This can be rewritten to:
    $$r = \\frac{\\sum_{i=1}^n \\left(x_i - \\bar{x}\\right)\\left(y_i - \\bar{y}\\right)}{\\sqrt{SS_x\\times SS_y}}$$
    
    With:
    $$SS_x = \\sum_{i=1}^{n_x} \\left(x_i - \\bar{x}\\right)^2$$
    $$SS_y = \\sum_{i=1}^{n_y} \\left(y_i - \\bar{y}\\right)^2$$
    $$\\bar{x} = \\frac{\\sum_{i=1}^{n_x} x_i}{n_x}$$
    $$\\bar{y} = \\frac{\\sum_{i=1}^{n_y} y_i}{n_y}$$
    $$\\sigma_x = \\sqrt{\\frac{\\sum_{i=1}^{n_x} \\left(x_i - \\bar{x}\\right)^2}{n_x}}$$
    $$\\sigma_y = \\sqrt{\\frac{\\sum_{i=1}^{n_y} \\left(y_i - \\bar{y}\\right)^2}{n_y}}$$
    
    Note that \\(n = n_x = n_y\\), since only cases where scores are available on both variables are used.
    
    The Student t-approximation is done using (Pugh & Winslow, 1966, pp. 196, 199):
    $$t_r = r\\times\\sqrt{\\frac{n-2}{1-r^2}}$$
    $$df = n - 2$$
    $$sig. = 2\\times\\left(1 - T\\left(\\left|t_r\\right|, df\\right)\\right)$$
    
    The standard normal approximation is done using (Fisher, 1915, p. 521; Steiger, 1980, p. 246):
    $$z_r = \\text{atanh}\\left(r\\right)\\times\\sqrt{n - 3}$$
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|z_r\\right|\\right)\\right)$$
    
    The Fisher correction is done using (Fisher, 1915, p. 521):
    $$r_f = r\\times\\left(1+\\frac{1-r^2}{2\\times n}\\right)$$
    
    The Olkin-Pratt correction is done using (Olkin & Pratt, 1958, p. 203):
    $$r_{op} = r\\times\\left(1+\\frac{1-r^2}{2\\times\\left(n-3\\right)}\\right)$$
    
    *Symbols Used*
    
    * \\(x_i\\), is the i-th score from variable x
    * \\(y_i\\), is the i-th score from variable y
    * \\(\\bar{x}\\), is the average of the scores from variable x
    * \\(\\bar{y}\\), is the average of the scores from variable y
    * \\(n_x\\), is the number of scores from variable x
    * \\(n_y\\), is the number of scores from variable y
    * \\(n\\), is the number of pairs of x and y
    * \\(T\\left(\\dots\\right)\\), is the cumulative distribution function of a T distribution
    * \\(\\Phi\\left(\\dots\\right)\\), is the cumulative distribution function of a standard normal distribution
    
    References
    ----------
    Fisher, R. A. (1915). Frequency distribution of the values of the correlation coefficient in samples from an indefinitely large population. *Biometrika, 10*(4), 507–521. doi:10.2307/2331838
    
    Olkin, I., & Pratt, J. W. (1958). Unbiased estimation of certain correlation coefficients. *The Annals of Mathematical Statistics, 29*(1), 201–211. doi:10.1214/aoms/1177706717
    
    Pearson, K. (1896). Contributions to the mathematical theory of evolution. III. Regression, Heredity, and Panmixia. *Philosophical Transactions of the Royal Society of London. (A.)*, 1896, 253–318.
    
    Pugh, E. M., & Winslow, G. H. (1966). *The analysis of physical measurements*. Addison-Wesley.
    
    Steiger, J. H. (1980). Tests for comparing elements of a correlation matrix. *Psychological Bulletin, 87*(2), 245–251. doi:10.1037/0033-2909.87.2.245
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076   
    
    '''
    #accept plain lists by converting them to pandas series
    if isinstance(field1, list):
        field1 = pd.Series(field1)
        
    if isinstance(field2, list):
        field2 = pd.Series(field2)
    
    #pandas ignores pairs with a missing value when computing r
    r = field1.corr(field2)
    
    data = pd.concat([field1, field2], axis=1)
    data.columns = ["field1", "field2"]
    #Remove rows with missing values and reset index
    data = data.dropna()
    data = data.reset_index(drop=True)
    
    #number of complete pairs
    n = len(data["field1"])
    
    #Corrections
    if corr == "fisher":
        r = r * (1 + (1 - r**2) / (2 * n))
        
    elif corr == "olkin-pratt":
        r = r * (1 + (1 - r**2) / (2 * (n - 3)))
    
    if test == "t":
        
        tVal = r * ((n - 2) / (1 - r**2))**0.5
        df = n - 2
        pvalue = 2*(1 - t.cdf(abs(tVal), df))
        statistic = tVal
        
    elif test == "z":
        z = abs(atanh(r)) * (n - 3)**0.5
        pvalue = 2 * (1 - NormalDist().cdf(abs(z)))
        df = None
        statistic = z
        
    res = pd.DataFrame([[r, statistic, df, pvalue]])
    res.columns = ["r", "statistic", "df", "p-value"]
    
    return res

Functions

def r_pearson(field1, field2, corr=None, test='t')

Pearson Product-Moment Correlation Coefficient

A measure of linear correlation. A value of -1 indicates a perfect negative linear correlation (a straight line going down: if the score in one field goes up, the other goes down), 0 indicates no linear correlation, and +1 a perfect positive linear correlation (a straight line going up: if the score in one field goes up, so does the other).

Various tests can be used to determine if the coefficient is significantly different from zero. See notes for details.

Parameters

field1 : pandas series
    the numeric scores of the first variable
field2 : pandas series
    the numeric scores of the second variable
corr : {None, "fisher", "olkin-pratt"}, optional
    correction to be applied
test : {"t", "z"}, optional
    which test to perform. Default is "t"

Returns

res : dataframe, with
 
  • "r", the coefficient value
  • "statistic", the test statistic (t or z-value)
  • "df", degrees of freedom (only applicable for t-test)
  • "p-value", significance (p-value)

Notes

The formula used for the correlation coefficient (Pearson, 1896, p. 265):
$$r = \frac{\sum_{i=1}^n \left(x_i - \bar{x}\right)\left(y_i - \bar{y}\right)}{n\times \sigma_x\times\sigma_y}$$

This can be rewritten to:
$$r = \frac{\sum_{i=1}^n \left(x_i - \bar{x}\right)\left(y_i - \bar{y}\right)}{\sqrt{SS_x\times SS_y}}$$

With:
$$SS_x = \sum_{i=1}^{n_x} \left(x_i - \bar{x}\right)^2$$
$$SS_y = \sum_{i=1}^{n_y} \left(y_i - \bar{y}\right)^2$$
$$\bar{x} = \frac{\sum_{i=1}^{n_x} x_i}{n_x}$$
$$\bar{y} = \frac{\sum_{i=1}^{n_y} y_i}{n_y}$$
$$\sigma_x = \sqrt{\frac{\sum_{i=1}^{n_x} \left(x_i - \bar{x}\right)^2}{n_x}}$$
$$\sigma_y = \sqrt{\frac{\sum_{i=1}^{n_y} \left(y_i - \bar{y}\right)^2}{n_y}}$$

Note that \(n = n_x = n_y\), since only cases where scores are available on both variables are used.
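
As a quick check, the rewritten formula can be computed directly from the deviation scores; a sketch with made-up data:

import pandas as pd

x = pd.Series([1, 2, 3, 4, 5, 6])
y = pd.Series([2, 1, 4, 3, 6, 5])

dx = x - x.mean()
dy = y - y.mean()
r = (dx * dy).sum() / ((dx**2).sum() * (dy**2).sum())**0.5
print(r)  # same value as x.corr(y)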

The Student t-approximation is done using (Pugh & Winslow, 1966, pp. 196, 199):
$$t_r = r\times\sqrt{\frac{n-2}{1-r^2}}$$
$$df = n - 2$$
$$sig. = 2\times\left(1 - T\left(\left|t_r\right|, df\right)\right)$$
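
In code, this t-approximation amounts to the following sketch (r and n made up for illustration):

from scipy.stats import t

n = 20
r = 0.4
t_r = r * ((n - 2) / (1 - r**2))**0.5
df = n - 2
p = 2 * (1 - t.cdf(abs(t_r), df))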

The standard normal approximation is done using (Fisher, 1915, p. 521; Steiger, 1980, p. 246):
$$z_r = \text{atanh}\left(r\right)\times\sqrt{n - 3}$$
$$sig. = 2\times\left(1 - \Phi\left(\left|z_r\right|\right)\right)$$
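
The corresponding sketch for the standard normal approximation (same made-up r and n):

from math import atanh
from statistics import NormalDist

n = 20
r = 0.4
z_r = atanh(r) * (n - 3)**0.5
p = 2 * (1 - NormalDist().cdf(abs(z_r)))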

The Fisher correction is done using (Fisher, 1915, p. 521):
$$r_f = r\times\left(1+\frac{1-r^2}{2\times n}\right)$$

The Olkin-Pratt correction is done using (Olkin & Pratt, 1958, p. 203):
$$r_{op} = r\times\left(1+\frac{1-r^2}{2\times\left(n-3\right)}\right)$$
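
Both corrections are one-liners; a sketch with the same made-up values:

n = 20
r = 0.4
r_f = r * (1 + (1 - r**2) / (2 * n))         # Fisher
r_op = r * (1 + (1 - r**2) / (2 * (n - 3)))  # Olkin-Pratt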

Symbols Used

  • \(x_i\), is the i-th score from variable x
  • \(y_i\), is the i-th score from variable y
  • \(\bar{x}\), is the average of the scores from variable x
  • \(\bar{y}\), is the average of the scores from variable y
  • \(n_x\), is the number of scores from variable x
  • \(n_y\), is the number of scores from variable y
  • \(n\), is the number of pairs of x and y
  • \(T\left(\dots\right)\), is the cumulative distribution function of a T distribution
  • \(\Phi\left(\dots\right)\), is the cumulative distribution function of a standard normal distribution

References

Fisher, R. A. (1915). Frequency distribution of the values of the correlation coefficient in samples from an indefinitely large population. Biometrika, 10(4), 507–521. doi:10.2307/2331838

Olkin, I., & Pratt, J. W. (1958). Unbiased estimation of certain correlation coefficients. The Annals of Mathematical Statistics, 29(1), 201–211. doi:10.1214/aoms/1177706717

Pearson, K. (1896). Contributions to the mathematical theory of evolution. III. Regression, Heredity, and Panmixia. Philosophical Transactions of the Royal Society of London. (A.), 1896, 253–318.

Pugh, E. M., & Winslow, G. H. (1966). The analysis of physical measurements. Addison-Wesley.

Steiger, J. H. (1980). Tests for comparing elements of a correlation matrix. Psychological Bulletin, 87(2), 245–251. doi:10.1037/0033-2909.87.2.245

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
