From c3651fb62efb649792fb0f2108d68fd34f15db15 Mon Sep 17 00:00:00 2001
From: Daniel Svitan <daniel.svitan.team7274dev@gmail.com>
Date: Wed, 21 May 2025 19:18:30 +0200
Subject: [PATCH] :test_tube: Adds goodness of fit test

---
 analysis.py | 57 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/analysis.py b/analysis.py
index e616eec..a6e8e99 100644
--- a/analysis.py
+++ b/analysis.py
@@ -36,6 +36,9 @@ categories = [
 ]
 categories_c = 17  # how many categories
 
+# number of years for which we have data
+years = 9
+
 
 def map_counties(arr: List[str]) -> List[int]:
     ret = []
@@ -57,7 +60,6 @@ with open("dataset.txt") as stream:
 
         raw_data.append([year, category, *map_counties(wins_raw)])
 
-
 # 0 - year
 # 1 - abteilung (category) idx (starts at 1)
 # 2-7 - first to last place county idxs
@@ -69,28 +71,53 @@ data_original = np.array(raw_data)
 # ZA |  9 |  8 | ...
 # KE |  4 |  6 | ...
 # as a row-first 2d numpy array (first dimension will represent counties, second counts of placements)
-data = np.zeros((counties_c, 5))  # 5 because top five
+# data = np.zeros((counties_c, 5))  # 5 because top five
+# for sample in data_original:
+#     results = sample[2:7]
+#     for placement_idx, county_idx in enumerate(results):
+#         data[county_idx, placement_idx] += 1
+
+# data is a table: rows are placements, columns are samples, cells hold county indices
+# 1st | 5 | 1 | 2 | ...
+# 2nd | 3 | 0 | 7 | ...
+# 3rd ...
+# data = np.zeros((5, years * categories_c))  # same as (5, len(data_original))
+# for i, sample in enumerate(data_original):
+#     results = sample[2:7]
+#     for j in range(5):
+#         data[j][i] = results[j]
+
+# top-five placements per county (each of the five places counts once)
+# chi-square goodness-of-fit test against a uniform distribution over counties
+# based on observed vs. expected frequencies
+observed = np.zeros(counties_c)
 for sample in data_original:
     results = sample[2:7]
-    for placement_idx, county_idx in enumerate(results):
-        data[county_idx, placement_idx] += 1
+    for i in results:
+        observed[i] += 1
 
+expected = np.ones_like(observed) * (sum(observed) / len(observed))
 
 print("Data:")
-print(data)
+print(observed)
+print(expected)
+
+chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
+print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")
 
 # H0: county and placement are independent
 # H1: county and placement are not independent
-print("\nAttempting Chi-Square test")
-chi2, p, dof, expected = stats.chi2_contingency(data)
 
-print(f"Chi-Square Statistic: {chi2}")
-print(f"p-value: {p}")
-print(f"Degrees of Freedom: {dof}")
-#print("Expected Frequencies:\n", expected)
+# print("\nAttempting Chi-Square test")
+# chi2, p, dof, expected = stats.chi2_contingency(data)
 
-print("\nAttempting Fisher's Exact test")
-oddsratio, p_value = stats.fisher_exact(data)
+# print(f"Chi-Square Statistic: {chi2}")
+# print(f"p-value: {p}")
+# print(f"Degrees of Freedom: {dof}")
+# print("Expected Frequencies:\n", expected)
 
-print(f"Odds Ratio: {oddsratio}")
-print(f"p-value: {p_value}")
+# print("\nAttempting Fisher's Exact test")
+# oddsratio, p_value = stats.fisher_exact(data)
+
+# print(f"Odds Ratio: {oddsratio}")
+# print(f"p-value: {p_value}")