🧱 Creates new data structure
This commit is contained in:
parent
bd4c976608
commit
feed2387c8
77
analysis.py
77
analysis.py
@ -3,23 +3,44 @@ import scipy.stats as stats
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
counties = {
|
||||
"BA": 0,
|
||||
"TN": 1,
|
||||
"TT": 2,
|
||||
"NR": 3,
|
||||
"BB": 4,
|
||||
"ZA": 5,
|
||||
"PO": 6,
|
||||
"KE": 7
|
||||
}
|
||||
counties_k = list(counties.keys())
|
||||
counties = [
|
||||
"BA",
|
||||
"TN",
|
||||
"TT",
|
||||
"NR",
|
||||
"BB",
|
||||
"ZA",
|
||||
"PO",
|
||||
"KE"
|
||||
]
|
||||
counties_c = len(counties) # how many counties
|
||||
|
||||
categories = [
|
||||
"Problematika voľného času",
|
||||
"Matematika, fyzika",
|
||||
"Chémia, potravinárstvo",
|
||||
"Biológia",
|
||||
"Životné prostredie, geografia, geológia",
|
||||
"Zdravotníctvo, farmakológia",
|
||||
"Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
|
||||
"Cestovný ruch, hotelierstvo, gastronómia",
|
||||
"Strojárstvo, hutníctvo, doprava",
|
||||
"Stavebníctvo, geodézia, kartografia",
|
||||
"Informatika",
|
||||
"Elektrotechnika, hardware, mechatronika",
|
||||
"História, filozofia, právne vedy",
|
||||
"Tvorba učebných pomôcok, didaktické technológie",
|
||||
"Ekonomika a riadenie",
|
||||
"Teória kultúry, umenie, umelecká, odevná tvorba",
|
||||
"Pedagogika, psychológia, sociológia"
|
||||
]
|
||||
categories_c = 17 # how many categories
|
||||
|
||||
|
||||
def map_counties(arr: List[str]) -> List[int]:
|
||||
ret = []
|
||||
for county in arr:
|
||||
ret.append(counties[county])
|
||||
ret.append(counties.index(county))
|
||||
return ret
|
||||
|
||||
|
||||
@ -38,18 +59,26 @@ with open("dataset.txt") as stream:
|
||||
|
||||
|
||||
# 0 - year
|
||||
# 1 - abteilung (category) id
|
||||
# 1 - abteilung (category) id (starts at 1)
|
||||
# 2-7 - first to last place county ids
|
||||
data = np.array(raw_data)
|
||||
data_original = np.array(raw_data)
|
||||
|
||||
print("Testing place distribution for normality by county")
|
||||
for id in range(8):
|
||||
places = []
|
||||
for sample in data:
|
||||
for i, v in enumerate(sample[2:7]):
|
||||
if v == id:
|
||||
places.append(i)
|
||||
# table where counties are rows and category-scores are columns
|
||||
# 01 | 02 | 03 | ...
|
||||
# BA | 5 | 2 | 1 | ...
|
||||
# TT | 0 | 3 | 4 | ...
|
||||
# KE | 4 | 1 | 5 | ...
|
||||
# ...
|
||||
# as a row-first 2d numpy array (first dimension will represent counties, second category-scores)
|
||||
data = np.zeros((counties_c, categories_c))
|
||||
for sample in data_original:
|
||||
category_id = sample[1] - 1 # because they start at 1
|
||||
results = sample[2:7]
|
||||
for i, county_id in enumerate(results):
|
||||
# first -> 5
|
||||
# second -> 4
|
||||
# ... (formula is 6 - i; i starts at 0, so as written first place adds 6, not 5)
|
||||
data[county_id, category_id] += 6 - i
|
||||
|
||||
# null hypothesis is that the sample comes from a normal distribution
|
||||
F, p = stats.normaltest(places)
|
||||
print(f"{counties_k[id]}: {p:.4f} - {"not " if p < 0.05 else ""}normally distributed")
|
||||
|
||||
print(data)
|
||||
|
Loading…
x
Reference in New Issue
Block a user