from typing import List

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# Two-letter county codes. A county's position in this list is the integer
# index used for it everywhere else in the script.
counties = [
    "BA",
    "TN",
    "TT",
    "NR",
    "BB",
    "ZA",
    "PO",
    "KE",
]
counties_c = len(counties)  # how many counties

# Population of each county, in the same order as `counties`.
counties_population = [
    736_385,  # BA
    565_900,  # TN
    565_572,  # TT
    665_600,  # NR
    611_124,  # BB
    686_063,  # ZA
    810_008,  # PO
    778_799,  # KE
]  # source: https://sk.wikipedia.org/wiki/Zoznam_krajov_na_Slovensku
total_population = sum(counties_population)

# Competition category names (kept in the original Slovak wording).
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# how many categories (derived, so it cannot drift from the list above)
categories_c = len(categories)

# from how many years do we have data
years = 9


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their indices in ``counties``.

    Args:
        arr: County codes such as ``"BA"``.

    Returns:
        The index of each code within ``counties``, in input order.

    Raises:
        ValueError: If a code is not present in ``counties``.
    """
    return [counties.index(county) for county in arr]


# Each data line of dataset.txt reads "<year> <category> <c1>,<c2>,...",
# where the comma-separated county codes run from first place downwards.
raw_data = []
with open("dataset.txt", encoding="utf-8") as stream:
    for line in stream:
        # Split on any whitespace before testing for emptiness: a blank line
        # still carries its newline, so testing the raw line never skips it.
        fields = line.split()
        if not fields:
            continue
        year = int(fields[0])
        category = int(fields[1])
        wins_raw = fields[2].split(",")
        raw_data.append([year, category, *map_counties(wins_raw)])

# Column layout of data_original:
#   0  - year
#   1  - category idx (starts at 1)
#   2+ - county idxs from first to last place (the analysis reads the top
#        five, i.e. columns 2-6)
data_original = np.array(raw_data)

# table where counties are rows and counts of placements are columns
#    | #1 | #2 | ...
# BA | 5  | 4  | ...
# ZA | 9  | 8  | ...
# KE | 4  | 6  | ...
# Unused alternative data layouts, kept for reference.
#
# (a) counties x placement counts,
# as a row-first 2d numpy array (first dimension will represent counties,
# second counts of placements)
# data = np.zeros((counties_c, 5))  # 5 because top five
# for sample in data_original:
#     results = sample[2:7]
#     for placement_idx, county_idx in enumerate(results):
#         data[county_idx, placement_idx] += 1

# (b) data is a table where rows represent placement, columns are samples
# and each cell holds a county index
# 1st | 5 | 1 | 2 | ...
# 2nd | 3 | 0 | 7 | ...
# 3rd ...
# data = np.zeros((5, years * categories_c))  # same as (5, len(data_original))
# for i, sample in enumerate(data_original):
#     results = sample[2:7]
#     for j in range(5):
#         data[j][i] = results[j]

# wins per county
# goodness-of-fit problem using Chi Square
# based on observed vs expected frequency
#
# H0: every county collects top-five placements in proportion to its
#     population (equal wins per capita).
# H1: at least one county deviates from that proportion.
top_places = 5  # placements per row that enter the analysis (columns 2-6)
observed = np.zeros(counties_c)
for sample in data_original:
    # A county may hold several of the five places in one row, so count each
    # occurrence individually.
    for county_idx in sample[2:2 + top_places]:
        observed[county_idx] += 1
print("Observed before adjusting for population:")
print(observed)

population = np.asarray(counties_population, dtype=float)

# micro-wins per capita (because wins would be a tiny number).
# Printed for information only: Pearson's chi-square statistic is defined on
# raw counts, and rescaling the inputs rescales the statistic, so these rates
# must not be passed to the test.
print("Observed after adjusting for population:")
print(observed / population * 1_000_000)

# Expected counts under H0: the total number of wins split by population
# share. Sums to observed.sum(), as stats.chisquare requires.
expected = observed.sum() * population / population.sum()
print("Expected after adjusting for population:")
print(expected)

chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")

# Unused alternative tests on the `data` table sketched above, kept for
# reference.
# H0: county and placement are independent
# H1: county and placement are not independent
# print("\nAttempting Chi-Square test")
# chi2, p, dof, expected = stats.chi2_contingency(data)
# print(f"Chi-Square Statistic: {chi2}")
# print(f"p-value: {p}")
# print(f"Degrees of Freedom: {dof}")
# print("Expected Frequencies:\n", expected)

# print("\nAttempting Fisher's Exact test")
# oddsratio, p_value = stats.fisher_exact(data)
# print(f"Odds Ratio: {oddsratio}")
# print(f"p-value: {p_value}")