🧱 Creates new data structure

2025-05-03 20:26:10 +02:00
parent bd4c976608
commit feed2387c8
1 changed files with 53 additions and 24 deletions
--- a/analysis.py
+++ b/analysis.py
@@ -3,23 +3,44 @@ import scipy.stats as stats
 import numpy as np
 import matplotlib.pyplot as plt
-counties = {
+counties = [
-    "BA": 0,
+    "BA",
-    "TN": 1,
+    "TN",
-    "TT": 2,
+    "TT",
-    "NR": 3,
+    "NR",
-    "BB": 4,
+    "BB",
-    "ZA": 5,
+    "ZA",
-    "PO": 6,
+    "PO",
-    "KE": 7
+    "KE"
-}
+]
-counties_k = list(counties.keys())
+counties_c = len(counties)  # how many counties
 categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia"
 ]
 categories_c = 17  # how many categories
 def map_counties(arr: List[str]) -> List[int]:
    ret = []
    for county in arr:
-        ret.append(counties[county])
+        ret.append(counties.index(county))
    return ret
@@ -38,18 +59,26 @@ with open("dataset.txt") as stream:
 # 0 - year
-# 1 - abteilung (category) id
+# 1 - abteilung (category) id (starts at 1)
 # 2-7 - first to last place county ids
-data = np.array(raw_data)
+data_original = np.array(raw_data)
-print("Testing place distribution for normality by county")
+# table where counties are rows and category-scores are columns
-for id in range(8):
+#      01 | 02 | 03 | ...
-    places = []
+# BA |  5 |  2 |  1 | ...
-    for sample in data:
+# TT |  0 |  3 |  4 | ...
-        for i, v in enumerate(sample[2:7]):
+# KE |  4 |  1 |  5 | ...
-            if v == id:
+# ...
-               places.append(i)
+# as a row-first 2d numpy array (first dimension will represent counties, second category-scores)
 data = np.zeros((counties_c, categories_c))
 for sample in data_original:
    category_id = sample[1] - 1  # because they start at 1
    results = sample[2:7]
    for i, county_id in enumerate(results):
        # first  -> 5
        # second -> 4
        # ... (i.e. 6 - place; NOTE: i is 0-based here, so `6 - i` actually gives first -> 6)
        data[county_id, category_id] += 6 - i
-    # null hypothesis is that the sample comes from a normal distribution
+
-    F, p = stats.normaltest(places)
+print(data)
    print(f"{counties_k[id]}: {p:.4f} - {"not " if p < 0.05 else ""}normally distributed")