import pandas as pd
from scipy.stats import chi2

def ts_pearson_gof(data, expCount=None, cc=None):
    '''
    Pearson Chi-Square Goodness-of-Fit Test

    Tests whether the observed category frequencies in ``data`` differ from
    the expected frequencies.

    Parameters
    ----------
    data : list, Pandas series, or single-column Pandas data frame
        The raw observations (one category label per observation).
        Missing values are ignored.
    expCount : Pandas data frame, optional
        First column holds the categories, second column the expected counts.
        The expected counts are rescaled so that they sum to the number of
        observations falling in the listed categories. Observations in
        categories that are not listed are ignored. If omitted, all
        categories found in ``data`` are assumed to be equally likely.
    cc : {None, "yates", "pearson", "williams"}, optional
        Which continuity correction to use. Default is None (no correction).

    Returns
    -------
    testResults : Pandas dataframe
        One row with the columns "statistic" (chi-square value), "df"
        (degrees of freedom), "p-value" (significance) and "test" (a
        description of the test used).

    Raises
    ------
    ValueError
        If ``cc`` is not one of the supported options, ``data`` is a data
        frame with more than one column, fewer than two categories are
        available, there are no observations in the categories, or an
        expected count is not positive.

    Notes
    -----
    It uses chi2 from scipy's stats library.

    As a side effect the pandas option ``display.max_colwidth`` is set to
    None, so the full test description is shown when displaying the result.

    Author
    ------
    Made by P. Stikker

    Please visit: https://PeterStatistics.com

    YouTube channel: https://www.youtube.com/stikpet

    '''
    # description suffix per supported correction; also used for validation,
    # because an unknown option would otherwise silently give statistic 0
    cc_labels = {
        None: "",
        "pearson": ", with E. Pearson continuity correction",
        "williams": ", with Williams continuity correction",
        "yates": ", with Yates continuity correction",
    }
    if cc not in cc_labels:
        raise ValueError(
            'cc must be None, "yates", "pearson" or "williams", got ' + repr(cc))

    # bring the data into a one-dimensional Series
    if isinstance(data, pd.DataFrame):
        if data.shape[1] != 1:
            raise ValueError("data must be a list, a Series or a single-column DataFrame")
        values = data.iloc[:, 0]
    else:
        values = pd.Series(data)

    # observed frequency per category (value_counts skips missing values)
    counts = values.value_counts()

    if expCount is None:
        observed = [float(c) for c in counts]
        weights = [1.0] * len(observed)
    else:
        categories = list(expCount.iloc[:, 0])
        weights = [float(w) for w in expCount.iloc[:, 1]]
        # categories listed in expCount but absent from the data get count 0
        observed = [float(c) for c in counts.reindex(categories, fill_value=0)]

    # number of categories (k) and number of observations in them (n)
    k = len(observed)
    n = sum(observed)

    if k < 2:
        raise ValueError("at least two categories are needed for a goodness-of-fit test")
    if n == 0:
        raise ValueError("data contains no observations in the categories to test")
    if any(not w > 0 for w in weights):
        raise ValueError("all expected counts must be positive")

    # the degrees of freedom
    df = k - 1

    # rescale the expected counts so they sum to the sample size
    total_weight = sum(weights)
    expected = [w / total_weight * n for w in weights]

    # calculate the chi-square value
    if cc == "yates":
        chiVal = sum((abs(o - e) - 0.5)**2 / e for o, e in zip(observed, expected))
    else:
        chiVal = sum((o - e)**2 / e for o, e in zip(observed, expected))
        if cc == "pearson":
            chiVal = (n - 1) / n * chiVal
        elif cc == "williams":
            chiVal = chiVal / (1 + (k**2 - 1) / (6 * n * (k - 1)))

    pVal = float(chi2.sf(chiVal, df))

    # which test was used
    testUsed = "Pearson chi-square test of goodness-of-fit" + cc_labels[cc]

    testResults = pd.DataFrame([[chiVal, df, pVal, testUsed]], columns=["statistic", "df", "p-value", "test"])
    # make sure the (long) test description is not truncated when displayed
    pd.set_option('display.max_colwidth', None)

    return testResults

# Example usage
# 19 observations of marital status, stored in a single-column data frame
data = pd.DataFrame(
    {
        "marital": [
            "MARRIED", "DIVORCED", "MARRIED", "SEPARATED", "DIVORCED",
            "NEVER MARRIED", "DIVORCED", "DIVORCED", "NEVER MARRIED",
            "MARRIED", "MARRIED", "MARRIED", "SEPARATED", "DIVORCED",
            "NEVER MARRIED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "MARRIED",
        ]
    }
)

# 1) without expected counts
ts_pearson_gof(data)

# 2) with an explicit table of categories and expected counts
eCounts = pd.DataFrame(
    [["MARRIED", 5], ["DIVORCED", 5], ["NEVER MARRIED", 5], ["SEPARATED", 5]],
    columns=["category", "count"],
)
ts_pearson_gof(data, eCounts)

# 3) without expected counts, applying the E. Pearson correction
ts_pearson_gof(data, cc="pearson")