From 8e4cac078904749db02be4125d4cfb324dfd607b Mon Sep 17 00:00:00 2001 From: Daniel Svitan Date: Fri, 18 Apr 2025 19:33:26 +0200 Subject: [PATCH] :zap: Adds parsing 2022 pdf results --- get_data.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/get_data.py b/get_data.py index e371c65..8c84a58 100644 --- a/get_data.py +++ b/get_data.py @@ -44,8 +44,10 @@ if ok.lower() == "n": counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"] -semifull_counties = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj", +counties2023 = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj", "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"] +counties2022 = ["Bratislavský", "Trnavský", "Trenčiansky", "Nitriansky", + "Žilinský", "Banskobystrický", "Prešovský", "Košický"] def extract_2024(cat_str: str, lines: List[str]): @@ -71,16 +73,26 @@ def extract_2024(cat_str: str, lines: List[str]): def extract_2023(lines: List[str]): results = [] - for i in range(len(lines)): - line = lines[i] - if line[0:2] not in ["2.", "3.", "4.", "5.", "6."]: + for line in lines: + if line not in counties2023: continue - if lines[i-1] not in semifull_counties: - print("County not recognized: ", lines[i-1]) + results.append(counties[counties2023.index(line)]) + if len(results) == 5: + return results + + return results + + +def extract_2022(lines: List[str]): + results = [] + + for line in lines: + word = line.strip().split(" ")[-1].strip() + if word not in counties2022: continue - results.append(counties[semifull_counties.index(lines[i-1])]) + results.append(counties[counties2022.index(word)]) if len(results) == 5: return results @@ -108,7 +120,7 @@ for pdf in pdfs: cat = int(max(matches[0])) cat_str = f"{cat:>02}" - if year != 2023: + if year != 2022: continue id = f"{year}-{cat_str}" @@ -130,6 +142,8 @@ for pdf in pdfs: results = extract_2024(cat_str, lines) case 2023: results = extract_2023(lines) + case 2022: + results = extract_2022(lines) print("got results: ", results) exit(0)