diff --git a/analysis.py b/analysis.py
index cf4937b..9aff396 100644
--- a/analysis.py
+++ b/analysis.py
@@ -3,23 +3,44 @@ import scipy.stats as stats
 import numpy as np
 import matplotlib.pyplot as plt
 
-counties = {
-    "BA": 0,
-    "TN": 1,
-    "TT": 2,
-    "NR": 3,
-    "BB": 4,
-    "ZA": 5,
-    "PO": 6,
-    "KE": 7
-}
-counties_k = list(counties.keys())
+counties = [
+    "BA",
+    "TN",
+    "TT",
+    "NR",
+    "BB",
+    "ZA",
+    "PO",
+    "KE"
+]
+counties_c = len(counties)  # how many counties
+
+categories = [
+    "Problematika voľného času",
+    "Matematika, fyzika",
+    "Chémia, potravinárstvo",
+    "Biológia",
+    "Životné prostredie, geografia, geológia",
+    "Zdravotníctvo, farmakológia",
+    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
+    "Cestovný ruch, hotelierstvo, gastronómia",
+    "Strojárstvo, hutníctvo, doprava",
+    "Stavebníctvo, geodézia, kartografia",
+    "Informatika",
+    "Elektrotechnika, hardware, mechatronika",
+    "História, filozofia, právne vedy",
+    "Tvorba učebných pomôcok, didaktické technológie",
+    "Ekonomika a riadenie",
+    "Teória kultúry, umenie, umelecká, odevná tvorba",
+    "Pedagogika, psychológia, sociológia"
+]
+categories_c = len(categories)  # how many categories (derived, like counties_c)
 
 
 def map_counties(arr: List[str]) -> List[int]:
     ret = []
     for county in arr:
-        ret.append(counties[county])
+        ret.append(counties.index(county))
     return ret
 
 
@@ -38,18 +59,26 @@ with open("dataset.txt") as stream:
 
 
 # 0 - year
-# 1 - abteilung (category) id
+# 1 - abteilung (category) id (starts at 1)
 # 2-7 - first to last place county ids
-data = np.array(raw_data)
+data_original = np.array(raw_data)
 
-print("Testing place distribution for normality by county")
-for id in range(8):
-    places = []
-    for sample in data:
-        for i, v in enumerate(sample[2:7]):
-            if v == id:
-                places.append(i)
+# score table: one row per county, one column per category, e.g.
+#    | 01 | 02 | 03 | ...
+# BA |  5 |  2 |  1 | ...
+# TT |  0 |  3 |  4 | ...
+# KE |  4 |  1 |  5 | ...
+# ...
+# stored row-first in a 2d numpy array (axis 0 = county, axis 1 = category)
+data = np.zeros((counties_c, categories_c))
+for sample in data_original:
+    category_id = sample[1] - 1  # because they start at 1
+    results = sample[2:7]
+    for place, county_id in enumerate(results, start=1):
+        # first -> 5
+        # second -> 4
+        # ... (formula is 6 - place; place is counted from 1, not 0)
+        data[county_id, category_id] += 6 - place
 
-    # null hypothesis is that the sample comes from a normal distribution
-    F, p = stats.normaltest(places)
-    print(f"{counties_k[id]}: {p:.4f} - {"not " if p < 0.05 else ""}normally distributed")
+
+print(data)