🔨 Adds kruskal and reworks chi square

2025-05-21 20:09:56 +02:00
parent 48b1ac4753
commit 5f6656d710
1 changed files with 29 additions and 42 deletions
--- a/analysis.py
+++ b/analysis.py
@@ -76,31 +76,13 @@ with open("dataset.txt") as stream:
 # 2-7 - first to last place county idxs
 data_original = np.array(raw_data)
-# table where counties are rows and counts of placements are columnes
+# ------
-#      #1 | #2 | ...
+# H0: each county wins proportionally to its population
-# BA |  5 |  4 | ...
+# H1: some counties win more than others after adjusting for population
 # ZA |  9 |  8 | ...
 # KE |  4 |  6 | ...
 # as a row-first 2d numpy array (first dimension will represent counties, second counts of placements)
 # data = np.zeros((counties_c, 5))  # 5 because top five
 # for sample in data_original:
 #     results = sample[2:7]
 #     for placement_idx, county_idx in enumerate(results):
 #         data[county_idx, placement_idx] += 1
 # data is a table where rows represent placements and columns hold county indices
 # 1st | 5 | 1 | 2 | ...
 # 2nd | 3 | 0 | 7 | ...
 # 3rd ...
 # data = np.zeros((5, years * categories_c))  # same as (5, len(data_original))
 # for i, sample in enumerate(data_original):
 #     results = sample[2:7]
 #     for j in range(5):
 #         data[j][i] = results[j]
 # wins per county
-# goodness-of-fit problem using Chi Square
+# goodness-of-fit problem using chi square
 # based on observed vs expected frequency
 observed = np.zeros(counties_c)
 for sample in data_original:
    results = sample[2:7]
@@ -110,34 +92,39 @@ for sample in data_original:
 print("Observed before adjusting for population:")
 print(observed)
-# micro-wins per capita (because wins would be a tiny number)
+# micro-wins per capita (because wins per capita would be a tiny number)
 for i in range(len(observed)):
-    observed[i] = observed[i] / counties_population[i] * 1_000_000
+    observed[i] = observed[i] / counties_population[i] * 1_000_000  # `*million` because unit is micro
-print("Observed after adjusting for population:")
+print("Observed after adjusting for population [micro-wins per capita]:")
 print(observed)
 expected = np.ones_like(observed) * (sum(observed) / len(observed))
-print("Expected after adjusting for population:")
+print("Expected:")
 print(expected)
 chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
-print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")
+print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")
 print("")
-# H0: county and placement are independent
+# ------
-# H1: county and placement are not independent
+# H0: there is no difference between county placements (once they do get placed)
 # H1: some counties get better placements than others
 # Kruskal-Wallis test
 # placements grouped by county
-# print("\nAttempting Chi-Square test")
+# data holds one row per county, listing every placement (1-5) that county achieved
-# chi2, p, dof, expected = stats.chi2_contingency(data)
+# BA | 5 | 1 | 4 | ...
 # TN | 2 | 2 | 3 | ...
 # TT ...
 data = []
 for _ in range(8):
    data.append([])
 for sample in data_original:
    results = sample[2:7]
    for i, res in enumerate(results):
        data[res].append(i + 1)  # range is [1 - 5]
-# print(f"Chi-Square Statistic: {chi2}")
+F, p = stats.kruskal(*data)
-# print(f"p-value: {p}")
+print(f"Kruskal-Wallis F = {F:.4f}, p-value = {p:.8f}")
 # print(f"Degrees of Freedom: {dof}")
 # print("Expected Frequencies:\n", expected)
 # print("\nAttempting Fisher's Exact test")
 # oddsratio, p_value = stats.fisher_exact(data)
 # print(f"Odds Ratio: {oddsratio}")
 # print(f"p-value: {p_value}")