🧱 Creates new data structure

This commit is contained in:
Daniel Svitan 2025-05-03 20:26:10 +02:00
parent bd4c976608
commit feed2387c8

View File

@ -3,23 +3,44 @@ import scipy.stats as stats
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
counties = { counties = [
"BA": 0, "BA",
"TN": 1, "TN",
"TT": 2, "TT",
"NR": 3, "NR",
"BB": 4, "BB",
"ZA": 5, "ZA",
"PO": 6, "PO",
"KE": 7 "KE"
} ]
counties_k = list(counties.keys()) counties_c = len(counties) # how many counties
categories = [
"Problematika voľného času",
"Matematika, fyzika",
"Chémia, potravinárstvo",
"Biológia",
"Životné prostredie, geografia, geológia",
"Zdravotníctvo, farmakológia",
"Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
"Cestovný ruch, hotelierstvo, gastronómia",
"Strojárstvo, hutníctvo, doprava",
"Stavebníctvo, geodézia, kartografia",
"Informatika",
"Elektrotechnika, hardware, mechatronika",
"História, filozofia, právne vedy",
"Tvorba učebných pomôcok, didaktické technológie",
"Ekonomika a riadenie",
"Teória kultúry, umenie, umelecká, odevná tvorba",
"Pedagogika, psychológia, sociológia"
]
categories_c = 17 # how many categories
def map_counties(arr: List[str]) -> List[int]: def map_counties(arr: List[str]) -> List[int]:
ret = [] ret = []
for county in arr: for county in arr:
ret.append(counties[county]) ret.append(counties.index(county))
return ret return ret
@ -38,18 +59,26 @@ with open("dataset.txt") as stream:
# 0 - year # 0 - year
# 1 - abteilung (category) id # 1 - abteilung (category) id (starts at 1)
# 2-7 - first to last place county ids # 2-7 - first to last place county ids
data = np.array(raw_data) data_original = np.array(raw_data)
print("Testing place distribution for normality by county") # table where counties are rows and category-scores are columns
for id in range(8): # 01 | 02 | 03 | ...
places = [] # BA | 5 | 2 | 1 | ...
for sample in data: # TT | 0 | 3 | 4 | ...
for i, v in enumerate(sample[2:7]): # KE | 4 | 1 | 5 | ...
if v == id: # ...
places.append(i) # as a row-first 2d numpy array (first dimension will represent counties, second category-scores)
data = np.zeros((counties_c, categories_c))
for sample in data_original:
category_id = sample[1] - 1 # because they start at 1
results = sample[2:7]
for i, county_id in enumerate(results):
# first -> 5
# second -> 4
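# ... (formula is 6 - i for a 1-based place i; NOTE: enumerate's i is 0-based here, so first place currently scores 6, not 5)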
data[county_id, category_id] += 6 - i
# null hypothesis is that the sample comes from a normal distribution
F, p = stats.normaltest(places) print(data)
print(f"{counties_k[id]}: {p:.4f} - {"not " if p < 0.05 else ""}normally distributed")