From c3651fb62efb649792fb0f2108d68fd34f15db15 Mon Sep 17 00:00:00 2001 From: Daniel Svitan Date: Wed, 21 May 2025 19:18:30 +0200 Subject: [PATCH] :test_tube: Adds goodness of fit test --- analysis.py | 57 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/analysis.py b/analysis.py index e616eec..a6e8e99 100644 --- a/analysis.py +++ b/analysis.py @@ -36,6 +36,9 @@ categories = [ ] categories_c = 17 # how many categories +# from how many years do we have data +years = 9 + def map_counties(arr: List[str]) -> List[int]: ret = [] @@ -57,7 +60,6 @@ with open("dataset.txt") as stream: raw_data.append([year, category, *map_counties(wins_raw)]) - # 0 - year # 1 - abteilung (category) idx (starts at 1) # 2-7 - first to last place county idxs @@ -69,28 +71,53 @@ data_original = np.array(raw_data) # ZA | 9 | 8 | ... # KE | 4 | 6 | ... # as a row-first 2d numpy array (first dimension will represent counties, second counts of placements) -data = np.zeros((counties_c, 5)) # 5 because top five +# data = np.zeros((counties_c, 5)) # 5 because top five +# for sample in data_original: +# results = sample[2:7] +# for placement_idx, county_idx in enumerate(results): +# data[county_idx, placement_idx] += 1 + +# data is table where rows represent placement and columns county index +# 1st | 5 | 1 | 2 | ... +# 2nd | 3 | 0 | 7 | ... +# 3rd ... +# data = np.zeros((5, years * categories_c)) # same as (5, len(data_original)) +# for i, sample in enumerate(data_original): +# results = sample[2:7] +# for j in range(5): +# data[j][i] = results[j] + +# wins per county +# goodness-of-fit problem using Chi Square +# based on observed vs expected frequency +observed = np.zeros(counties_c) for sample in data_original: results = sample[2:7] - for placement_idx, county_idx in enumerate(results): - data[county_idx, placement_idx] += 1 + for i in results: + observed[i] += 1 +expected = np.ones_like(observed) * (sum(observed) / len(observed)) print("Data:") -print(data) +print(observed) +print(expected) + +chi2, p = stats.chisquare(f_obs=observed, f_exp=expected) +print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}") # H0: county and placement are independent # H1: county and placement are not independent -print("\nAttempting Chi-Square test") -chi2, p, dof, expected = stats.chi2_contingency(data) -print(f"Chi-Square Statistic: {chi2}") -print(f"p-value: {p}") -print(f"Degrees of Freedom: {dof}") -#print("Expected Frequencies:\n", expected) +# print("\nAttempting Chi-Square test") +# chi2, p, dof, expected = stats.chi2_contingency(data) -print("\nAttempting Fisher's Exact test") -oddsratio, p_value = stats.fisher_exact(data) +# print(f"Chi-Square Statistic: {chi2}") +# print(f"p-value: {p}") +# print(f"Degrees of Freedom: {dof}") +# print("Expected Frequencies:\n", expected) -print(f"Odds Ratio: {oddsratio}") -print(f"p-value: {p_value}") +# print("\nAttempting Fisher's Exact test") +# oddsratio, p_value = stats.fisher_exact(data) + +# print(f"Odds Ratio: {oddsratio}") +# print(f"p-value: {p_value}")