from typing import List

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# County codes; a county's position in this list is its numeric index.
counties = ["BA", "TN", "TT", "NR", "BB", "ZA", "PO", "KE"]
counties_c = len(counties)  # how many counties

categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# Derived from the list (17 entries) so the count cannot drift out of sync.
categories_c = len(categories)  # how many categories


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their indices in ``counties``.

    Args:
        arr: County codes such as ``"BA"`` or ``"KE"``.

    Returns:
        The position of each code in ``counties``, in input order.

    Raises:
        ValueError: If a code is not present in ``counties``.
    """
    return [counties.index(county) for county in arr]


raw_data = []
with open("dataset.txt") as stream:
    for line in stream:
        # Lines read from a file keep their trailing newline, so an "empty"
        # line is never falsy; split on whitespace first and skip if nothing
        # is left.
        fields = line.split()
        if not fields:
            continue
        year = int(fields[0])
        category = int(fields[1])
        wins_raw = fields[2].split(",")
        raw_data.append([year, category, *map_counties(wins_raw)])

# Column layout of each row:
# 0 - year
# 1 - category idx (starts at 1)
# 2-7 - first to last place county idxs
data_original = np.array(raw_data)

# table where counties are rows and counts of placements are columns
#    | #1 | #2 | ...
# BA | 5  | 4  | ...
# ZA | 9  | 8  | ...
# KE | 4  | 6  | ...
# Contingency table as a row-first 2d numpy array: the first dimension
# represents counties, the second the placement (1st to 5th); each cell counts
# how many times that county finished in that place.
data = np.zeros((counties_c, 5))  # 5 because top five
for sample in data_original:
    # Columns 0 and 1 hold year and category; the next five are the top five.
    results = sample[2:7]
    for placement_idx, county_idx in enumerate(results):
        data[county_idx, placement_idx] += 1

print("Data:")
print(data)

# H0: county and placement are independent
# H1: county and placement are not independent
print("\nAttempting Chi-Square test")
chi2, p, dof, expected = stats.chi2_contingency(data)
print(f"Chi-Square Statistic: {chi2}")
print(f"p-value: {p}")
print(f"Degrees of Freedom: {dof}")
# print("Expected Frequencies:\n", expected)

print("\nAttempting Fisher's Exact test")
# The table is larger than 2x2. SciPy releases whose fisher_exact only handles
# 2x2 tables raise ValueError here, so report that instead of crashing after
# the chi-square results were already printed. The counts are whole numbers
# stored as floats; pass them as integers, which is what the test expects.
# NOTE(review): on SciPy versions that do accept r x c tables the exact
# computation may be slow for a table of this size - confirm on the target
# environment.
try:
    fisher_stat, p_value = stats.fisher_exact(data.astype(np.int64))
except ValueError as err:
    print(f"Fisher's Exact test is not available for this table: {err}")
else:
    # The statistic is an odds ratio only for 2x2 tables, so label it neutrally.
    print(f"Statistic: {fisher_stat}")
    print(f"p-value: {p_value}")