from typing import List

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# County codes. A county's numeric id is its position in this list.
counties = ["BA", "TN", "TT", "NR", "BB", "ZA", "PO", "KE"]
counties_c = len(counties)  # how many counties

# Category names. The dataset refers to categories by 1-based id
# (presumably the position in this list -- confirm against the dataset).
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# Derived from the list (like counties_c) so the two cannot drift apart.
categories_c = len(categories)  # how many categories

# Number of leading places in each result row that earn points.
# First place earns `places_scored` points, the last scored place earns 1.
places_scored = 5


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their ids (positions in ``counties``).

    Args:
        arr: County codes such as ``"BA"`` or ``"KE"``.

    Returns:
        The ids of the given codes, in the same order.

    Raises:
        ValueError: If a code is not present in ``counties``.
    """
    return [counties.index(county) for county in arr]


def _parse_line(line: str, line_no: int) -> List[int]:
    """Parse one non-blank dataset line into ``[year, category, *county_ids]``.

    The line holds three whitespace-separated fields: the year, the 1-based
    category id and a comma-separated list of county codes ordered from first
    place downwards.
    """
    fields = line.split()
    if len(fields) < 3:
        raise ValueError(
            f"dataset line {line_no}: expected 'year category counties', "
            f"got {line!r}"
        )
    year = int(fields[0])
    category = int(fields[1])
    return [year, category, *map_counties(fields[2].split(","))]


def _read_dataset(path: str) -> List[List[int]]:
    """Read every non-blank line of the dataset file at *path*."""
    rows = []
    with open(path, encoding="utf-8") as stream:
        for line_no, line in enumerate(stream, start=1):
            # Lines read from a file keep their newline, so a blank line is
            # "\n" (truthy); test the stripped content instead.
            if not line.strip():
                continue
            rows.append(_parse_line(line, line_no))
    return rows


def _build_scores(rows: np.ndarray) -> np.ndarray:
    """Accumulate placement points into a (county, category) table.

    For every row, the county in place ``p`` (1-based, up to
    ``places_scored``) receives ``places_scored + 1 - p`` points in that
    row's category: first -> 5, second -> 4, ..., fifth -> 1.
    """
    table = np.zeros((counties_c, categories_c))
    for sample in rows:
        category_id = sample[1] - 1  # category ids in the file start at 1
        placements = sample[2:2 + places_scored]
        # Count places from 1 so that the "6 - place" formula really gives
        # first place 5 points (counting from 0 handed out 6..2 instead).
        for place, county_id in enumerate(placements, start=1):
            table[county_id, category_id] += places_scored + 1 - place
    return table


raw_data = _read_dataset("dataset.txt")

# Column layout of each row:
# 0    - year
# 1    - category id (starts at 1)
# 2... - county ids ordered from first place downwards; only the first
#        `places_scored` of them (columns 2-6) are scored
data_original = np.array(raw_data)

# Table where counties are rows and category scores are columns:
#      01 | 02 | 03 | ...
# BA |  5 |  2 |  1 | ...
# TT |  0 |  3 |  4 | ...
# KE |  4 |  1 |  5 | ...
# ...
# stored as a row-first 2d numpy array (first dimension represents counties,
# second dimension represents category scores).
data = _build_scores(data_original)

print(data)