state-soc-cross/analysis.py
2025-05-21 19:49:10 +02:00

144 lines
4.0 KiB
Python

from typing import List
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
# Two-letter county codes. A county's position in this list is the integer
# index used for it throughout the script.
counties = [
    "BA",
    "TN",
    "TT",
    "NR",
    "BB",
    "ZA",
    "PO",
    "KE",
]
counties_c = len(counties)  # how many counties

# Population of each county, listed in the same order as `counties`.
# source: https://sk.wikipedia.org/wiki/Zoznam_krajov_na_Slovensku
counties_population = [
    736_385,  # BA
    565_900,  # TN
    565_572,  # TT
    665_600,  # NR
    611_124,  # BB
    686_063,  # ZA
    810_008,  # PO
    778_799,  # KE
]
total_population = sum(counties_population)

# Competition categories, in order (the dataset layout notes further down say
# that category indices in the data start at 1).
categories = [
    "Problematika voľného času",
    "Matematika, fyzika",
    "Chémia, potravinárstvo",
    "Biológia",
    "Životné prostredie, geografia, geológia",
    "Zdravotníctvo, farmakológia",
    "Pôdohospodárstvo (poľnohospodárstvo, lesné a vodné hospodárstvo)",
    "Cestovný ruch, hotelierstvo, gastronómia",
    "Strojárstvo, hutníctvo, doprava",
    "Stavebníctvo, geodézia, kartografia",
    "Informatika",
    "Elektrotechnika, hardware, mechatronika",
    "História, filozofia, právne vedy",
    "Tvorba učebných pomôcok, didaktické technológie",
    "Ekonomika a riadenie",
    "Teória kultúry, umenie, umelecká, odevná tvorba",
    "Pedagogika, psychológia, sociológia",
]
# how many categories; derived from the list so the two cannot drift apart
categories_c = len(categories)

# number of years for which we have data
years = 9
def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their positions in ``counties``.

    Args:
        arr: County codes, e.g. ``["BA", "KE"]``.

    Returns:
        A list of the same length holding, for each code, its index in the
        module-level ``counties`` list.

    Raises:
        ValueError: If a code is not present in ``counties``.
    """
    return [counties.index(county) for county in arr]
# Parse the dataset: one record per line, in the form
#   "<year> <category> <code>,<code>,..."
# where the comma-separated county codes are ordered by placement.
raw_data = []
with open("dataset.txt", encoding="utf-8") as stream:
    for line in stream:
        # Split on any run of whitespace; this also drops the trailing newline.
        split = line.split()
        if not split:
            # Blank or whitespace-only line. Checking the raw line would never
            # skip it, because a line read from a file still carries its "\n".
            continue
        year = int(split[0])
        category = int(split[1])
        wins_raw = split[2].split(",")
        raw_data.append([year, category, *map_counties(wins_raw)])

# Column layout of data_original:
# 0 - year
# 1 - category idx (starts at 1)
# 2 onward - county idxs, first place to last place
data_original = np.array(raw_data)
# table where counties are rows and counts of placements are columns
# #1 | #2 | ...
# BA | 5 | 4 | ...
# ZA | 9 | 8 | ...
# KE | 4 | 6 | ...
# as a row-first 2d numpy array (first dimension will represent counties, second counts of placements)
# data = np.zeros((counties_c, 5)) # 5 because top five
# for sample in data_original:
# results = sample[2:7]
# for placement_idx, county_idx in enumerate(results):
# data[county_idx, placement_idx] += 1
# data is a table where rows represent placements, columns represent samples, and each cell holds a county index
# 1st | 5 | 1 | 2 | ...
# 2nd | 3 | 0 | 7 | ...
# 3rd ...
# data = np.zeros((5, years * categories_c)) # same as (5, len(data_original))
# for i, sample in enumerate(data_original):
# results = sample[2:7]
# for j in range(5):
# data[j][i] = results[j]
# wins per county
# Goodness-of-fit problem using chi-square, based on observed vs expected
# frequency. H0: top-five placements are distributed among the counties in
# proportion to their population.
observed = np.zeros(counties_c)
for sample in data_original:
    # columns 2-6 hold the county indices of the top five placements
    for county_idx in sample[2:7]:
        observed[county_idx] += 1

print("Observed before adjusting for population:")
print(observed)

populations = np.asarray(counties_population, dtype=float)

# micro-wins per capita (because wins per capita would be a tiny number).
# Printed for description only: the chi-square statistic is not scale-free,
# so running the test on rates multiplied by an arbitrary constant would make
# the p-value depend on that constant.
observed_per_capita = observed / populations * 1_000_000
print("Observed after adjusting for population:")
print(observed_per_capita)

# Expected counts under H0: each county receives a share of all observed
# placements equal to its share of the total population. The expected counts
# therefore sum to the observed total, as stats.chisquare requires.
expected = observed.sum() * populations / total_population
print("Expected after adjusting for population:")
print(expected)

# The test compares raw counts with population-proportional expected counts.
chi2, p = stats.chisquare(f_obs=observed, f_exp=expected)
print(f"Chi-square = {chi2:.2f}, p-value = {p:.4f}")
# H0: county and placement are independent
# H1: county and placement are not independent
# print("\nAttempting Chi-Square test")
# chi2, p, dof, expected = stats.chi2_contingency(data)
# print(f"Chi-Square Statistic: {chi2}")
# print(f"p-value: {p}")
# print(f"Degrees of Freedom: {dof}")
# print("Expected Frequencies:\n", expected)
# print("\nAttempting Fisher's Exact test")
# oddsratio, p_value = stats.fisher_exact(data)
# print(f"Odds Ratio: {oddsratio}")
# print(f"p-value: {p_value}")