🧱 Creates new data structure

2025-05-03 20:26:10 +02:00
parent bd4c976608
commit feed2387c8
1 changed files with 53 additions and 24 deletions
--- a/analysis.py
+++ b/analysis.py
@@ -3,23 +3,44 @@ import scipy.stats as stats
 import numpy as np
 import matplotlib.pyplot as plt

-counties = {
-    "BA": 0,
-    "TN": 1,
-    "TT": 2,
-    "NR": 3,
-    "BB": 4,
-    "ZA": 5,
-    "PO": 6,
-    "KE": 7
-}
-counties_k = list(counties.keys())
+counties = [
+    "BA",
+    "TN",
+    "TT",
+    "NR",
+    "BB",
+    "ZA",
+    "PO",
+    "KE"
+]
+counties_c = len(counties)  # how many counties
+
+categories = [
+    "Problematika voľného času",
+    "Matematika, fyzika",
+    "Chémia, potravinárstvo",
+    "Biológia",
+    "Životné prostredie, geografia, geológia",
+    "Zdravotníctvo, farmakológia",
+    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
+    "Cestovný ruch, hotelierstvo, gastronómia",
+    "Strojárstvo, hutníctvo, doprava",
+    "Stavebníctvo, geodézia, kartografia",
+    "Informatika",
+    "Elektrotechnika, hardware, mechatronika",
+    "História, filozofia, právne vedy",
+    "Tvorba učebných pomôcok, didaktické technológie",
+    "Ekonomika a riadenie",
+    "Teória kultúry, umenie, umelecká, odevná tvorba",
+    "Pedagogika, psychológia, sociológia"
+]
+categories_c = len(categories)  # how many categories (derived, like counties_c)


 def map_counties(arr: List[str]) -> List[int]:
    ret = []
    for county in arr:
-        ret.append(counties[county])
+        ret.append(counties.index(county))
    return ret


@@ -38,18 +59,26 @@ with open("dataset.txt") as stream:


 # 0 - year
-# 1 - abteilung (category) id
+# 1 - abteilung (category) id (starts at 1)
 # 2-7 - first to last place county ids
-data = np.array(raw_data)
+data_original = np.array(raw_data)

-print("Testing place distribution for normality by county")
-for id in range(8):
-    places = []
-    for sample in data:
-        for i, v in enumerate(sample[2:7]):
-            if v == id:
-               places.append(i)
+# table where counties are rows and category-scores are columns
+#      01 | 02 | 03 | ...
+# BA |  5 |  2 |  1 | ...
+# TT |  0 |  3 |  4 | ...
+# KE |  4 |  1 |  5 | ...
+# ...
+# as a row-first 2d numpy array (first dimension will represent counties, second category-scores)
+data = np.zeros((counties_c, categories_c))
+for sample in data_original:
+    category_id = sample[1] - 1  # because they start at 1
+    results = sample[2:7]
+    for i, county_id in enumerate(results):
+        # i is 0-based (enumerate), so the score is 5 - i:
+        # first  -> 5
+        # second -> 4, ... fifth -> 1
+        data[county_id, category_id] += 5 - i

-    # null hypothesis is that the sample comes from a normal distribution
-    F, p = stats.normaltest(places)
-    print(f"{counties_k[id]}: {p:.4f} - {"not " if p < 0.05 else ""}normally distributed")
+
+print(data)