🧪 Adds goodness of fit test

2025-05-21 19:18:30 +02:00
parent bd60a9aa3f
commit c3651fb62e
1 changed files with 42 additions and 15 deletions
@@ -36,6 +36,9 @@ categories = [
 ]
 categories_c = 17  # how many categories
 # number of years for which we have data
 years = 9
 def map_counties(arr: List[str]) -> List[int]:
    ret = []
@@ -57,7 +60,6 @@ with open("dataset.txt") as stream:
        raw_data.append([year, category, *map_counties(wins_raw)])
 # 0 - year
 # 1 - abteilung (category) idx (starts at 1)
 # 2-7 - first to last place county idxs
@@ -69,28 +71,53 @@ data_original = np.array(raw_data)
 # ZA |  9 |  8 | ...
 # KE |  4 |  6 | ...
 # as a row-first 2d numpy array (first dimension indexes counties, second indexes placements; each cell is a count)
-data = np.zeros((counties_c, 5))  # 5 because top five
+# data = np.zeros((counties_c, 5))  # 5 because top five
 # for sample in data_original:
 #     results = sample[2:7]
 #     for placement_idx, county_idx in enumerate(results):
 #         data[county_idx, placement_idx] += 1
 # data is a table where rows represent placements and each column is one sample (year/category); cells hold county indices
 # 1st | 5 | 1 | 2 | ...
 # 2nd | 3 | 0 | 7 | ...
 # 3rd ...
 # data = np.zeros((5, years * categories_c))  # same as (5, len(data_original))
 # for i, sample in enumerate(data_original):
 #     results = sample[2:7]
 #     for j in range(5):
 #         data[j][i] = results[j]
 # wins per county
 # goodness-of-fit problem using Chi Square
 # based on observed vs expected frequency
 observed = np.zeros(counties_c)
 for sample in data_original:
    results = sample[2:7]
-    for placement_idx, county_idx in enumerate(results):
+    for i in results:
-        data[county_idx, placement_idx] += 1
+        observed[i] += 1
 expected = np.ones_like(observed) * (sum(observed) / len(observed))
 print("Data:")
-print(data)
+print(observed)
 print(expected)
 chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
 print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")
 # H0: county and placement are independent
 # H1: county and placement are not independent
 print("\nAttempting Chi-Square test")
 chi2, p, dof, expected = stats.chi2_contingency(data)
-print(f"Chi-Square Statistic: {chi2}")
+# print("\nAttempting Chi-Square test")
-print(f"p-value: {p}")
+# chi2, p, dof, expected = stats.chi2_contingency(data)
-print(f"Degrees of Freedom: {dof}")
+
 # print(f"Chi-Square Statistic: {chi2}")
 # print(f"p-value: {p}")
 # print(f"Degrees of Freedom: {dof}")
 # print("Expected Frequencies:\n", expected)
-print("\nAttempting Fisher's Exact test")
+# print("\nAttempting Fisher's Exact test")
-oddsratio, p_value = stats.fisher_exact(data)
+# oddsratio, p_value = stats.fisher_exact(data)
-print(f"Odds Ratio: {oddsratio}")
+# print(f"Odds Ratio: {oddsratio}")
-print(f"p-value: {p_value}")
+# print(f"p-value: {p_value}")