# state-soc-cross/analysis.py

from typing import List
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# County abbreviations; a county's position in this list is used as its numeric id.
counties = ["BA", "TN", "TT", "NR", "BB", "ZA", "PO", "KE"]
counties_c = len(counties)  # number of counties

# Category names.
# NOTE(review): presumably listed in category-id order (ids in the dataset
# start at 1) - confirm against the dataset.
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# Derived from the list (rather than hard-coded) so the count cannot drift
# out of sync when categories are added or removed.
categories_c = len(categories)  # how many categories


def map_counties(arr: List[str]) -> List[int]:
    """Translate county abbreviations into their positions in ``counties``.

    Raises ValueError (from ``list.index``) when an abbreviation is not
    present in ``counties``.
    """
    return [counties.index(code) for code in arr]


# Parse dataset.txt into rows of [year, category, *county_ids].
# Each non-blank line is expected to look like "<year> <category> <c1>,<c2>,...",
# where the comma-separated county abbreviations are given in placement order.
raw_data = []
with open("dataset.txt") as stream:
    for line in stream:
        # Strip before testing for blankness: raw lines keep their trailing
        # newline, so an unstripped blank line is truthy and would crash on
        # int("") below.
        line = line.strip()
        if not line:
            continue

        # split() without arguments tolerates repeated spaces and tabs.
        fields = line.split()
        year = int(fields[0])
        category = int(fields[1])
        wins_raw = fields[2].split(",")

        raw_data.append([year, category, *map_counties(wins_raw)])


# Column layout of each row in data_original:
# 0   - year
# 1   - category id (starts at 1)
# 2.. - county ids in placement order, first place first
#       (only columns 2-6, i.e. the first five places, are scored below)
data_original = np.array(raw_data)

# Score table where counties are rows and category scores are columns:
#      01 | 02 | 03 | ...
# BA |  5 |  2 |  1 | ...
# TT |  0 |  3 |  4 | ...
# KE |  4 |  1 |  5 | ...
# ...
# stored as a row-first 2d numpy array (first dimension indexes counties,
# second dimension indexes categories)
data = np.zeros((counties_c, categories_c))
for sample in data_original:
    category_id = sample[1] - 1  # because they start at 1
    results = sample[2:7]
    # Places are counted from 1 so that the score is 6 - place:
    # first  -> 5
    # second -> 4
    # ...
    # fifth  -> 1
    # (counting from 0 would award 6..2 instead of the intended 5..1)
    for place, county_id in enumerate(results, start=1):
        data[county_id, category_id] += 6 - place


print(data)