🧐 Adds parsed data

Daniel Svitan
2025-05-03 17:24:32 +02:00
parent 8e4cac0789
commit 85ba8d9651
5 changed files with 157 additions and 218 deletions


@@ -2,7 +2,6 @@ import re
import requests
import urllib.request
from bs4 import BeautifulSoup
from pypdf import PdfReader
from typing import List
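
# fetch the SOČ (Stredoškolská odborná činnosť) results page; the result-PDF links used below are scraped from it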
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
@@ -43,62 +42,6 @@ if ok.lower() == "n":
    exit(0)

counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
counties2023 = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
                "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]
counties2022 = ["Bratislavský", "Trnavský", "Trenčiansky", "Nitriansky",
                "Žilinský", "Banskobystrický", "Prešovský", "Košický"]
def extract_2024(cat_str: str, lines: List[str]):
    results = []
    for i in range(len(lines)):
        line = lines[i]
        if not line.startswith(cat_str):
            continue
        county = lines[i - 1][-2:]
        if county not in counties:
            continue
        results.append(county)
        if len(results) == 5:
            return results
    return results
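
# 2023 result PDFs: the county is printed as a full name ("Bratislavský kraj", ...)
# on its own line; map it back to its two-letter code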
def extract_2023(lines: List[str]):
    results = []
    for line in lines:
        if line not in counties2023:
            continue
        results.append(counties[counties2023.index(line)])
        if len(results) == 5:
            return results
    return results
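
# 2022 result PDFs: the county is the last word of the row ("Bratislavský", ...);
# map it back to its two-letter code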
def extract_2022(lines: List[str]):
    results = []
    for line in lines:
        word = line.strip().split(" ")[-1].strip()
        if word not in counties2022:
            continue
        results.append(counties[counties2022.index(word)])
        if len(results) == 5:
            return results
    return results
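
# download each result PDF, pull the text of its first page
# and run the extractor matching its year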
for pdf in pdfs:
    year = 0
    cat = 0
@@ -117,34 +60,14 @@ for pdf in pdfs:
    if not (matches := re.findall(category, pdf)):
        print("Couldn't get category, skipping")
        continue
    # category
    cat = int(max(matches[0]))
    cat_str = f"{cat:>02}"

    if year != 2022:
        continue

    id = f"{year}-{cat_str}"
    path = f"data/r{id}.pdf"
    print(f"Downloading {id} - '{pdf}' -> '{path}'")
    urllib.request.urlretrieve(pdf, path)
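
    # open the downloaded PDF and work with the text of its first page only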
    reader = PdfReader(path)
    if len(reader.pages) == 0:
        print("Couldn't find pages, skipping")
        continue
    page = reader.pages[0]
    text = page.extract_text()
    lines = text.split("\n")

    results = []
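    # the PDF layout differs per year, so pick the matching extractor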
    match year:
        case 2024:
            results = extract_2024(cat_str, lines)
        case 2023:
            results = extract_2023(lines)
        case 2022:
            results = extract_2022(lines)

    print("got results: ", results)
    exit(0)
    try:
        urllib.request.urlretrieve(pdf, path)
    except Exception as e:
        print(e)