🔨 Adds kruskal and reworks chi square
This commit is contained in:
parent
48b1ac4753
commit
5f6656d710
71
analysis.py
71
analysis.py
@ -76,31 +76,13 @@ with open("dataset.txt") as stream:
|
|||||||
# 2-7 - first to last place county idxs
|
# 2-7 - first to last place county idxs
|
||||||
data_original = np.array(raw_data)
|
data_original = np.array(raw_data)
|
||||||
|
|
||||||
# table where counties are rows and counts of placements are columns
|
# ------
|
||||||
# #1 | #2 | ...
|
# H0: each county wins proportionally to its population
|
||||||
# BA | 5 | 4 | ...
|
# H1: some counties win more than others after adjusting for population
|
||||||
# ZA | 9 | 8 | ...
|
|
||||||
# KE | 4 | 6 | ...
|
|
||||||
# as a row-first 2d numpy array (first dimension will represent counties, second counts of placements)
|
|
||||||
# data = np.zeros((counties_c, 5)) # 5 because top five
|
|
||||||
# for sample in data_original:
|
|
||||||
# results = sample[2:7]
|
|
||||||
# for placement_idx, county_idx in enumerate(results):
|
|
||||||
# data[county_idx, placement_idx] += 1
|
|
||||||
|
|
||||||
# data is table where rows represent placement and columns county index
|
|
||||||
# 1st | 5 | 1 | 2 | ...
|
|
||||||
# 2nd | 3 | 0 | 7 | ...
|
|
||||||
# 3rd ...
|
|
||||||
# data = np.zeros((5, years * categories_c)) # same as (5, len(data_original))
|
|
||||||
# for i, sample in enumerate(data_original):
|
|
||||||
# results = sample[2:7]
|
|
||||||
# for j in range(5):
|
|
||||||
# data[j][i] = results[j]
|
|
||||||
|
|
||||||
# wins per county
|
# wins per county
|
||||||
# goodness-of-fit problem using Chi Square
|
# goodness-of-fit problem using chi square
|
||||||
# based on observed vs expected frequency
|
# based on observed vs expected frequency
|
||||||
|
|
||||||
observed = np.zeros(counties_c)
|
observed = np.zeros(counties_c)
|
||||||
for sample in data_original:
|
for sample in data_original:
|
||||||
results = sample[2:7]
|
results = sample[2:7]
|
||||||
@ -110,34 +92,39 @@ for sample in data_original:
|
|||||||
print("Observed before adjusting for population:")
|
print("Observed before adjusting for population:")
|
||||||
print(observed)
|
print(observed)
|
||||||
|
|
||||||
# micro-wins per capita (because wins would be a tiny number)
|
# micro-wins per capita (because wins per capita would be a tiny number)
|
||||||
for i in range(len(observed)):
|
for i in range(len(observed)):
|
||||||
observed[i] = observed[i] / counties_population[i] * 1_000_000
|
observed[i] = observed[i] / counties_population[i] * 1_000_000 # `*million` because unit is micro
|
||||||
|
|
||||||
print("Observed after adjusting for population:")
|
print("Observed after adjusting for population [micro-wins per capita]:")
|
||||||
print(observed)
|
print(observed)
|
||||||
|
|
||||||
expected = np.ones_like(observed) * (sum(observed) / len(observed))
|
expected = np.ones_like(observed) * (sum(observed) / len(observed))
|
||||||
|
|
||||||
print("Expected after adjusting for population:")
|
print("Expected:")
|
||||||
print(expected)
|
print(expected)
|
||||||
|
|
||||||
chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
|
chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
|
||||||
print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")
|
print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")
|
||||||
|
print("")
|
||||||
|
|
||||||
# H0: county and placement are independent
|
# ------
|
||||||
# H1: county and placement are not independent
|
# H0: there is no difference between county placements (once they do get placed)
|
||||||
|
# H1: some counties get better placements than others
|
||||||
|
# Kruskal-Wallis H-test
|
||||||
|
# places grouped by counties
|
||||||
|
|
||||||
# print("\nAttempting Chi-Square test")
|
# data is a table where each row is a county and the columns are the placements it achieved
|
||||||
# chi2, p, dof, expected = stats.chi2_contingency(data)
|
# BA | 5 | 1 | 4 | ...
|
||||||
|
# TN | 2 | 2 | 3 | ...
|
||||||
|
# TT ...
|
||||||
|
data = []
|
||||||
|
for _ in range(8):
|
||||||
|
data.append([])
|
||||||
|
for sample in data_original:
|
||||||
|
results = sample[2:7]
|
||||||
|
for i, res in enumerate(results):
|
||||||
|
data[res].append(i + 1) # range is [1 - 5]
|
||||||
|
|
||||||
# print(f"Chi-Square Statistic: {chi2}")
|
F, p = stats.kruskal(*data)
|
||||||
# print(f"p-value: {p}")
|
print(f"Kruskal-Wallis F = {F:.4f}, p-value = {p:.8f}")
|
||||||
# print(f"Degrees of Freedom: {dof}")
|
|
||||||
# print("Expected Frequencies:\n", expected)
|
|
||||||
|
|
||||||
# print("\nAttempting Fisher's Exact test")
|
|
||||||
# oddsratio, p_value = stats.fisher_exact(data)
|
|
||||||
|
|
||||||
# print(f"Odds Ratio: {oddsratio}")
|
|
||||||
# print(f"p-value: {p_value}")
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user