# Column layout of every row in raw_data / data_original:
# 0 - year
# 1 - abteilung (category) idx (starts at 1)
# 2-7 - first to last place county idxs
# NOTE(review): only columns 2..6 (the top five) are read below; whether a
# column 7 exists cannot be told from this section - confirm against the loader.
data_original = np.array(raw_data)

# table where counties are rows and counts of placements are columns
#      #1 | #2 | ...
# BA |  5 |  4 | ...
# ZA |  9 |  8 | ...
# KE |  4 |  6 | ...
# as a row-first 2d numpy array (first dimension represents counties,
# second dimension the number of times a county reached each placement)
placements_c = 5  # only the top five placements are counted
data = np.zeros((counties_c, placements_c))

for sample in data_original:
    # county idxs ordered from first place to fifth place
    results = sample[2:2 + placements_c]
    for placement_idx, county_idx in enumerate(results):
        data[county_idx, placement_idx] += 1

print("Data:")
print(data)

# A county that never reached the top five contributes an all-zero row. Such a
# row makes an expected frequency zero, which chi2_contingency rejects with a
# ValueError, and it carries no information about the county/placement
# association, so it is left out of the tested table (data itself is untouched).
observed = data[data.sum(axis=1) > 0]
dropped_c = data.shape[0] - observed.shape[0]
if dropped_c:
    print(f"\nIgnoring {dropped_c} counties without any top-{placements_c} placement")

# H0: county and placement are independent
# H1: county and placement are not independent
# NOTE(review): the chi-square approximation is unreliable when many expected
# counts are small - inspect `expected` before trusting the p-value.
print("\nAttempting Chi-Square test")
chi2, p, dof, expected = stats.chi2_contingency(observed)

print(f"Chi-Square Statistic: {chi2}")
print(f"p-value: {p}")
print(f"Degrees of Freedom: {dof}")

print("\nAttempting Fisher's Exact test")
try:
    # SciPy releases that only implement the 2x2 case raise ValueError for a
    # counties x placements table; report that instead of aborting the script.
    fisher_statistic, p_value = stats.fisher_exact(observed)
except ValueError as err:
    print(f"Fisher's Exact test unavailable for a table of shape {observed.shape}: {err}")
else:
    # Only for a 2x2 table is the returned statistic an odds ratio.
    statistic_label = "Odds Ratio" if observed.shape == (2, 2) else "Statistic"
    print(f"{statistic_label}: {fisher_statistic}")
    print(f"p-value: {p_value}")