131 lines
3.5 KiB
Python
131 lines
3.5 KiB
Python
from typing import List
|
|
import scipy.stats as stats
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
# County code paired with its population, kept side by side so the two derived
# lists below can never get out of step with each other.
# source: https://sk.wikipedia.org/wiki/Zoznam_krajov_na_Slovensku
_COUNTY_POPULATION = (
    ("BA", 736_385),
    ("TN", 565_900),
    ("TT", 565_572),
    ("NR", 665_600),
    ("BB", 611_124),
    ("ZA", 686_063),
    ("PO", 810_008),
    ("KE", 778_799),
)

# County codes; a county's position in this list is its index everywhere else.
counties = [code for code, _ in _COUNTY_POPULATION]
counties_c = len(counties)  # how many counties

# Populations in the same order as `counties`.
counties_population = [population for _, population in _COUNTY_POPULATION]
total_population = sum(counties_population)
|
|
|
|
# Competition category names. The dataset's category column (1-based, per the
# column-layout comment next to `data_original`) presumably indexes this list.
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# how many categories -- derived from the list (same as `counties_c`) so the
# count cannot drift when a category is added or removed
categories_c = len(categories)

# from how many years do we have data
years = 9
|
|
|
|
|
|
def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their positions in ``counties``.

    Args:
        arr: County codes, e.g. ``["TN", "BA"]``.

    Returns:
        For every code in ``arr``, in the same order, its index in the
        module-level ``counties`` list.

    Raises:
        ValueError: If a code is not present in ``counties`` (raised by
            ``list.index``).
    """
    return [counties.index(code) for code in arr]
|
|
|
|
|
|
# Parse the dataset: one record per non-blank line, laid out as
#   "<year> <category> <county>,<county>,..."
raw_data = []
with open("dataset.txt") as stream:
    for line in stream:
        # Split on any run of whitespace. Lines read from a file keep their
        # trailing newline, so a blank line is "\n" (truthy); testing the
        # split result is what actually detects it.
        fields = line.split()
        if not fields:
            continue

        year = int(fields[0])
        category = int(fields[1])
        wins_raw = fields[2].split(",")

        raw_data.append([year, category, *map_counties(wins_raw)])

# Column layout of every row:
# 0 - year
# 1 - category idx (starts at 1)
# 2.. - county idxs from first to last place
#       (the analyses below read columns 2-6, i.e. sample[2:7])
data_original = np.array(raw_data)
|
|
|
|
# ------
# H0: each county wins proportionally to its population
# H1: some counties win more than others after adjusting for population
# wins per county
# goodness-of-fit problem using chi square
# based on observed vs expected frequency

# Raw number of placements per county. The chi-square statistic is defined on
# counts, so these (not rescaled per-capita rates) are what gets tested.
observed = np.zeros(counties_c)
for sample in data_original:
    for county_idx in sample[2:7]:
        observed[county_idx] += 1

print("Observed before adjusting for population:")
print(observed)

# micro-wins per capita (because wins per capita would be a tiny number).
# Shown for information only: a statistic computed from these rates would
# scale with the arbitrary 1_000_000 factor, making the p-value meaningless.
populations = np.asarray(counties_population, dtype=float)
observed_per_capita = observed / populations * 1_000_000  # `*million` because unit is micro

print("Observed after adjusting for population [micro-wins per capita]:")
print(observed_per_capita)

# Under H0 the total number of wins is shared out in proportion to population,
# so the expected counts sum to the observed total (as chisquare requires).
expected = observed.sum() * populations / total_population

print("Expected:")
print(expected)

chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")
print("")
|
|
|
|
# ------
# H0: there is no difference between county placements (once they do get placed)
# H1: some counties get better placements than others
# kruskal wallis
# places grouped by counties

# data is table where rows represent county and columns placements
# BA | 5 | 1 | 4 | ...
# TN | 2 | 2 | 3 | ...
# TT ...
# One row per entry of `counties` (not a hard-coded 8), so the table stays in
# step with the county list.
data = [[] for _ in range(counties_c)]
for sample in data_original:
    for place, county_idx in enumerate(sample[2:7], start=1):
        data[county_idx].append(place)  # range is [1 - 5]

# NOTE: stats.kruskal returns the Kruskal-Wallis H statistic; the variable
# name `F` and the printed label are kept as they were so output is unchanged.
F, p = stats.kruskal(*data)
print(f"Kruskal-Wallis F = {F:.4f}, p-value = {p:.8f}")
|