"""Test whether counties differ in how often and how well they place.

Reads ``dataset.txt`` (one line per year/category, formatted as
``<year> <category> <c1>,<c2>,...`` with county codes ordered from first to
last place) and runs a chi-square goodness-of-fit test on win counts and a
Kruskal-Wallis test on placements.
"""
from typing import Iterable, List, Sequence

import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

counties = [
    "BA",
    "TN",
    "TT",
    "NR",
    "BB",
    "ZA",
    "PO",
    "KE",
]
counties_c = len(counties)  # how many counties

counties_population = [
    736_385,  # BA
    565_900,  # TN
    565_572,  # TT
    665_600,  # NR
    611_124,  # BB
    686_063,  # ZA
    810_008,  # PO
    778_799,  # KE
]  # source: https://sk.wikipedia.org/wiki/Zoznam_krajov_na_Slovensku
total_population = sum(counties_population)

categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
categories_c = len(categories)  # how many categories

# from how many years do we have data
years = 9

# Row layout of the parsed data:
#   0   - year
#   1   - category index (starts at 1)
#   2-6 - county indices ordered from first to last place
_PLACES = slice(2, 7)


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their indices in ``counties``.

    Raises:
        ValueError: if a code is not present in ``counties``.
    """
    return [counties.index(county) for county in arr]


def parse_results(lines: Iterable[str]) -> List[List[int]]:
    """Parse dataset lines into ``[year, category, *county_indices]`` rows.

    Lines that are empty or contain only whitespace are skipped.
    """
    rows = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Lines read from a file keep their newline, so a blank line is
            # only detectable after stripping/splitting.
            continue
        year = int(fields[0])
        category = int(fields[1])
        winners = fields[2].split(",")
        rows.append([year, category, *map_counties(winners)])
    return rows


def count_wins(
    results: Iterable[Sequence[int]], n_counties: int = counties_c
) -> np.ndarray:
    """Count how many placements each county has across all rows."""
    wins = np.zeros(n_counties)
    for sample in results:
        for county_idx in sample[_PLACES]:
            wins[county_idx] += 1
    return wins


def expected_wins(
    observed_wins: Sequence[float],
    populations: Sequence[int] = counties_population,
) -> np.ndarray:
    """Return the win counts expected if wins were proportional to population.

    The result sums to the same total as ``observed_wins``.
    """
    pops = np.asarray(populations, dtype=float)
    return pops / pops.sum() * np.sum(observed_wins)


def placements_by_county(
    results: Iterable[Sequence[int]], n_counties: int = counties_c
) -> List[List[int]]:
    """Group achieved places (1 = first) by county index.

    Returns one list per county, e.g. ``groups[0]`` holds every place the
    first county in ``counties`` achieved.
    """
    groups: List[List[int]] = [[] for _ in range(n_counties)]
    for sample in results:
        for place, county_idx in enumerate(sample[_PLACES], start=1):
            groups[county_idx].append(place)  # range is [1 - 5]
    return groups


if __name__ == "__main__":
    with open("dataset.txt", encoding="utf-8") as stream:
        raw_data = parse_results(stream)

    data_original = np.array(raw_data)

    # ------
    # H0: each county wins proportionally to its population
    # H1: some counties win more than others after adjusting for population

    # wins per county
    # goodness-of-fit problem using chi square
    # based on observed vs expected frequency
    observed = count_wins(data_original)
    print("Observed before adjusting for population:")
    print(observed)

    # micro-wins per capita (because wins per capita would be a tiny number);
    # shown for information only
    per_capita = observed / np.asarray(counties_population) * 1_000_000
    print("Observed after adjusting for population [micro-wins per capita]:")
    print(per_capita)

    # The chi-square statistic is only meaningful on counts: feeding it the
    # rescaled per-capita rates would make the result depend on the arbitrary
    # scaling unit. The population adjustment therefore goes into the
    # expected counts instead.
    expected = expected_wins(observed)
    print("Expected:")
    print(expected)

    chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
    print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")

    print("")

    # ------
    # H0: there is no difference between county placements (once they do get placed)
    # H1: some counties get better placements than others

    # kruskal wallis
    # places grouped by counties

    # data is table where rows represent county and columns placements
    # BA | 5 | 1 | 4 | ...
    # TN | 2 | 2 | 3 | ...
    # TT ...
    data = placements_by_county(data_original)

    # A county that never placed contributes no sample to compare.
    h_stat, p = stats.kruskal(*[group for group in data if group])
    print(f"Kruskal-Wallis F = {h_stat:.4f}, p-value = {p:.8f}")