state-soc-cross/analysis.py

from typing import List
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# One row per county: (two-letter code, population).
# Population source: https://sk.wikipedia.org/wiki/Zoznam_krajov_na_Slovensku
_county_table = (
    ("BA", 736_385),
    ("TN", 565_900),
    ("TT", 565_572),
    ("NR", 665_600),
    ("BB", 611_124),
    ("ZA", 686_063),
    ("PO", 810_008),
    ("KE", 778_799),
)

# County codes; a county's position in this list is its index everywhere else.
counties = [code for code, _ in _county_table]
counties_c = len(counties)  # how many counties

# Populations, in the same order as `counties`.
counties_population = [population for _, population in _county_table]
total_population = sum(counties_population)

# Competition category names; dataset category numbers start at 1, so
# category number k corresponds to categories[k - 1].
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# how many categories; derived from the list (like `counties_c`) so the two
# can never drift apart
categories_c = len(categories)

# number of years the dataset covers
years = 9


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their positions in the global `counties` list.

    Args:
        arr: county codes, e.g. ``["BA", "KE"]``.

    Returns:
        A list of the same length holding, for each code, its index in
        `counties`.

    Raises:
        ValueError: if a code is not present in `counties`.
    """
    return [counties.index(code) for code in arr]


# Parse the dataset: one whitespace-separated record per line,
#   <year> <category> <comma-separated county codes>
raw_data = []
with open("dataset.txt") as stream:
    for line in stream:
        fields = line.split()
        if not fields:
            # Blank (or whitespace-only) line. The raw line still carries its
            # trailing newline, so it has to be split/stripped before it can
            # be recognised as empty.
            continue

        year = int(fields[0])
        category = int(fields[1])
        wins_raw = fields[2].split(",")

        raw_data.append([year, category, *map_counties(wins_raw)])

# Column layout of every row:
# 0   - year
# 1   - category index (starts at 1)
# 2.. - county indices ordered from first to last place
#       (the analyses below read columns 2-6, i.e. places 1-5)
data_original = np.array(raw_data)

# ------
# H0: each county wins proportionally to its population
# H1: some counties win more than others after adjusting for population
# Goodness-of-fit problem solved with a chi-square test comparing the observed
# number of wins per county with the number expected under H0.

# Count how many times each county appears among the placed positions.
observed = np.zeros(counties_c)
for sample in data_original:
    for county_idx in sample[2:7]:
        observed[county_idx] += 1

print("Observed before adjusting for population:")
print(observed)

populations = np.asarray(counties_population, dtype=float)

# micro-wins per capita (because wins per capita would be a tiny number);
# `* 1_000_000` because the unit is micro. This is descriptive output only:
# the chi-square statistic is not scale-invariant, so the test itself must be
# run on raw counts, not on rescaled rates.
observed_per_capita = observed / populations * 1_000_000

print("Observed after adjusting for population [micro-wins per capita]:")
print(observed_per_capita)

# Under H0 every county receives a share of all wins equal to its share of
# the total population; the expected counts sum to the observed total.
expected = observed.sum() * populations / total_population

print("Expected:")
print(expected)

chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
print(f"Chi-Square = {chi2:.4f}, p-value = {p:.8f}")
print("")

# ------
# H0: there is no difference between county placements (once they do get placed)
# H1: some counties get better placements than others
# Kruskal-Wallis test on the placements grouped by county

# `data` is a table with one row per county (same order as `counties`); each
# row lists every placement that county achieved
# BA | 5 | 1 | 4 | ...
# TN | 2 | 2 | 3 | ...
# TT ...
data = [[] for _ in range(counties_c)]  # sized from counties_c, not a literal 8
for sample in data_original:
    # columns 2-6 hold the county index for places 1..5, in that order
    for place, county_idx in enumerate(sample[2:7], start=1):
        data[county_idx].append(place)

# NOTE: scipy documents the returned statistic as the Kruskal-Wallis H
# statistic; the variable name and printed label are kept as "F" so existing
# output stays unchanged.
F, p = stats.kruskal(*data)
print(f"Kruskal-Wallis F = {F:.4f}, p-value = {p:.8f}")