From ef07162c0502ae976ad9e06b928996ff00f4a7c4 Mon Sep 17 00:00:00 2001
From: Daniel Svitan
Date: Sat, 3 May 2025 18:02:45 +0200
Subject: [PATCH] :fire: Fixes numpy not loading dataset

---
 analysis.py      | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 parsed.txt       | 13 ---------
 requirements.txt |  3 +++
 3 files changed, 73 insertions(+), 13 deletions(-)
 create mode 100644 analysis.py

diff --git a/analysis.py b/analysis.py
new file mode 100644
index 0000000..cbfa05a
--- /dev/null
+++ b/analysis.py
@@ -0,0 +1,70 @@
+"""Test per-county finishing places for normality.
+
+Reads ``parsed.txt`` (one result per line: ``<year> <category> <c1>,...,<c5>``
+with two-letter county codes) and runs scipy's normality test on the places
+each county achieved.
+"""
+from typing import List
+import scipy.stats as stats
+import numpy as np
+import matplotlib.pyplot as plt
+
+# County code -> numeric id stored in the data matrix.
+counties = {
+    "BA": 0,
+    "TN": 1,
+    "TT": 2,
+    "NR": 3,
+    "BB": 4,
+    "ZA": 5,
+    "PO": 6,
+    "KE": 7
+}
+# Position i holds the code whose id is i (dicts keep insertion order).
+counties_k = list(counties.keys())
+
+
+def map_counties(arr: List[str]) -> List[int]:
+    """Translate county codes into their numeric ids.
+
+    Raises KeyError for a code that is not in ``counties``.
+    """
+    return [counties[county] for county in arr]
+
+
+raw_data = []
+with open("parsed.txt") as stream:
+    for line in stream:
+        # Lines keep their trailing newline, so strip before testing for
+        # emptiness; otherwise a blank line reaches int() and aborts the parse.
+        line = line.strip()
+        if not line:
+            continue
+
+        split = line.split(" ")
+        year = int(split[0])
+        category = int(split[1])
+        wins_raw = split[2].split(",")
+
+        raw_data.append([year, category, *map_counties(wins_raw)])
+
+
+# 0 - year
+# 1 - abteilung id
+# 2-6 - first to last place county ids
+data = np.array(raw_data)
+
+print("Testing place distribution for normality by county")
+for county_id in range(len(counties_k)):
+    places = []
+    for sample in data:
+        for i, v in enumerate(sample[2:7]):
+            if v == county_id:
+                places.append(i)
+
+    # null hypothesis is that the sample comes from a normal distribution
+    _, p = stats.normaltest(places)
+    # Build the prefix outside the f-string: reusing the enclosing quote
+    # character inside a replacement field is a syntax error before 3.12.
+    verdict = "not " if p < 0.05 else ""
+    print(f"{counties_k[county_id]}: {p:.4f} - {verdict}normally distributed")
diff --git a/parsed.txt b/parsed.txt
index 985283e..888f37e 100644
--- a/parsed.txt
+++ b/parsed.txt
@@ -134,16 +134,3 @@
 2024 15 BA,KE,ZA,PO,ZA
 2024 16 PO,ZA,NR,BB,KE
 2024 17 BB,PO,ZA,PO,TT
-
-
-
-
-
-BA - Bratislavsky
-TN - Trnavsky
-TT - Trencinsky
-NR - Nitriansky
-BB - Banskobystricky
-ZA - Zilinsky
-KE - Kosicky
-PO - Presovsky
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 1190bd8..84fefef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 requests
 beautifulsoup4
+numpy
+matplotlib
+scipy