diff --git a/analysis.py b/analysis.py index cc008ae..b2b3e3a 100644 --- a/analysis.py +++ b/analysis.py @@ -76,31 +76,13 @@ with open("dataset.txt") as stream: # 2-7 - first to last place county idxs data_original = np.array(raw_data) -# table where counties are rows and counts of placements are columnes -# #1 | #2 | ... -# BA | 5 | 4 | ... -# ZA | 9 | 8 | ... -# KE | 4 | 6 | ... -# as a row-first 2d numpy array (first dimension will represent counties, second counts of placements) -# data = np.zeros((counties_c, 5)) # 5 because top five -# for sample in data_original: -# results = sample[2:7] -# for placement_idx, county_idx in enumerate(results): -# data[county_idx, placement_idx] += 1 - -# data is table where rows represent placement and columns county index -# 1st | 5 | 1 | 2 | ... -# 2nd | 3 | 0 | 7 | ... -# 3rd ... -# data = np.zeros((5, years * categories_c)) # same as (5, len(data_original)) -# for i, sample in enumerate(data_original): -# results = sample[2:7] -# for j in range(5): -# data[j][i] = results[j] - +# ------ +# H0: each county wins proportionally to its population +# H1: some counties win more than others after adjusting for population # wins per county -# goodness-of-fit problem using Chi Square +# goodness-of-fit problem using chi square # based on observed vs expected frequency + observed = np.zeros(counties_c) for sample in data_original: results = sample[2:7] @@ -110,34 +92,39 @@ for sample in data_original: print("Observed before adjusting for population:") print(observed) -# micro-wins per capita (because wins would be a tiny number) +# micro-wins per capita (because wins per capita would be a tiny number) for i in range(len(observed)): - observed[i] = observed[i] / counties_population[i] * 1_000_000 + observed[i] = observed[i] / counties_population[i] * 1_000_000 # `*million` because unit is micro -print("Observed after adjusting for population:") +print("Observed after adjusting for population [micro-wins per capita]:") 
print(observed)

# A chi-square goodness-of-fit test is only valid on raw frequencies: its
# statistic scales linearly with the unit of the inputs, so running it on
# micro-wins per capita makes the p-value depend on the arbitrary 1_000_000
# factor. `observed` was rescaled in place just above, so undo that scaling to
# recover the win counts and express the population adjustment through the
# expected frequencies instead.
population = np.array(
    [counties_population[i] for i in range(len(observed))], dtype=float
)
win_counts = observed * population / 1_000_000

# Under H0 each county's share of all wins equals its share of the population.
expected = win_counts.sum() * population / population.sum()
print("Expected:")
print(expected)

chi2, p = stats.chisquare(f_obs=win_counts, f_exp=expected)
print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")
print()

# ------
# H0: there is no difference between county placements
#     (once they do get placed)
# H1: some counties get better placements than others
# Kruskal-Wallis H-test on the placements grouped by county

# `data` has one row per county index; each row lists the placements
# (1 = first place ... 5 = fifth place) that county achieved:
# BA | 5 | 1 | 4 | ...
# TN | 2 | 2 | 3 | ...
# TT ...
data = [[] for _ in range(counties_c)]  # sized by counties_c, not a literal 8
for sample in data_original:
    results = sample[2:7]  # columns 2-7: county indices, first to last place
    for place, county_idx in enumerate(results, start=1):
        data[county_idx].append(place)

# A county that never placed contributes no observations; an empty sample
# would leave the test undefined, so only non-empty groups are compared.
placed_groups = [group for group in data if group]

# Kruskal-Wallis yields the H statistic (not an F statistic). The variable
# keeps its original name `F` so any later code reading it still works; only
# the printed label is corrected.
F, p = stats.kruskal(*placed_groups)
print(f"Kruskal-Wallis H = {F:.4f}, p-value = {p:.8f}")