# state-soc-cross/get_data.py

import os
import re
import urllib.request
from typing import List

import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader

# Fetch the siov.sk competition page and collect every anchor tag on it.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of scraping the
# error page as if it were the real listing.
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")

# Regexes applied to each PDF URL to work out the competition year.
# 202x: the four digits "202?" directly after "/uploads/" and before the next "/".
year202x = r"(?<=\/uploads\/)202\d(?=\/)"
# Earlier years are recognised by fixed URL fragments instead:
#   - "2019/05" is treated as a 2019 file,
#   - "2019/02/" is treated as a 2018 file, unless it is immediately followed
#     by "Vysledkova-listina-CK-SOC-2017",
#   - "SOC-2017-odbor" is treated as a 2017 file.
year2019 = r"2019\/05"
year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)"
year2017 = r"SOC-2017-odbor"
# Category number embedded in the URL. Four alternatives, tried in order:
#   1. two digits starting with 0 or 1, not preceded by "20", followed by "-" or ".pdf"
#   2. two digits starting with 0 or 1, directly after "/" and directly before "Vysled"
#   3. a single digit between "." and ".pdf"
#   4. a literal "2" directly before "-final"
# The pattern contains several capture groups, so re.findall() returns tuples in
# which only the group belonging to the alternative that matched is non-empty.
category = r"((?<!20)[01]\d((?=-)|(?=\.pdf)))|((?<=\/)[01]\d(?=Vysled))|((?<=\.)\d(?=\.pdf))|(2(?=-final))"  # don't ask, it works

# Collect the absolute URLs of the PDFs we are interested in from the scraped anchors.
pdfs = []
for link in links:
    href = link.get("href")
    # Anchors without an href attribute yield None; skip them instead of
    # crashing on the substring test below.
    if not href or ".pdf" not in href:
        continue

    # Keep only links from the whitelisted year/month upload paths; some of
    # them additionally require (or forbid) a marker in the file name.
    is_wanted = (
        "2024/04/" in href
        or ("2023/06/" in href and "Pisomne" not in href)
        or "2022/05/" in href
        or ("2021/05/" in href and "Vysled" in href)
        or ("2020/05/" in href and "Odbor" in href)
        or ("2019/05/" in href and "vysled" in href)
        or ("2019/02/" in href and "Vysled" in href)
    )
    if not is_wanted:
        continue

    # Links that do not already start with the site origin get it prepended.
    if not href.startswith(("https://siov.sk", "http://siov.sk")):
        href = "https://siov.sk" + href
    pdfs.append(href)


# Report how many PDFs matched and let the user bail out before anything is downloaded.
print(f"Found {len(pdfs)} pdfs")
ok = input("Continue? [Y/n] ")
# Ignore surrounding whitespace so that e.g. "n " still counts as a refusal;
# any other answer (including just pressing Enter) continues.
if ok.strip().lower() == "n":
    print("okay, bye")
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module for interactive use and is not guaranteed to exist in programs.
    raise SystemExit(0)


# County abbreviations and the matching county names. The two lists are looked
# up against each other by position, so both are derived from a single table to
# keep the pairing visible.
# NOTE(review): "Trenčianský" is spelled with a long "ý" while "Nitriansky" uses
# a short "y" — confirm which spelling the parsed PDFs actually contain.
_COUNTY_TABLE = (
    ("BA", "Bratislavský kraj"),
    ("TT", "Trnavský kraj"),
    ("TN", "Trenčianský kraj"),
    ("NR", "Nitriansky kraj"),
    ("ZA", "Žilinský kraj"),
    ("BB", "Banskobystrický kraj"),
    ("PO", "Prešovský kraj"),
    ("KE", "Košický kraj"),
)
counties = [abbreviation for abbreviation, _ in _COUNTY_TABLE]
semifull_counties = [full_name for _, full_name in _COUNTY_TABLE]


def extract_2024(cat_str: str, lines: List[str]) -> List[str]:
    """Return up to five county abbreviations found in 2024-style result text.

    A line counts as a result row when it starts with ``cat_str`` and the last
    two characters of the line directly before it are one of the known
    abbreviations in ``counties``.

    Args:
        cat_str: Zero-padded category number that result rows start with.
        lines: Lines of the extracted PDF page text, in page order.

    Returns:
        The matching abbreviations in order of appearance, at most five.
    """
    results = []

    # Work on the lines that were passed in (the function used to re-split the
    # module-level ``text`` and ignore its argument). Pairing each line with
    # its predecessor also means the first line is never matched against the
    # last one through a negative index.
    for previous, line in zip(lines, lines[1:]):
        if not line.startswith(cat_str):
            continue

        county = previous[-2:]
        if county not in counties:
            continue

        results.append(county)
        if len(results) == 5:
            break

    return results


def extract_2023(lines: List[str]) -> List[str]:
    """Return up to five county abbreviations found in 2023-style result text.

    A line counts when it starts with one of the markers "2." to "6." and the
    line directly before it is exactly one of the names in
    ``semifull_counties``; that name is translated to the abbreviation at the
    same position in ``counties``. Marker lines whose predecessor is not a
    known county name are reported on stdout and skipped.

    Args:
        lines: Lines of the extracted PDF page text, in page order.

    Returns:
        The abbreviations in order of appearance, at most five.
    """
    markers = ("2.", "3.", "4.", "5.", "6.")
    abbreviation_of = dict(zip(semifull_counties, counties))
    results = []

    # Pair each line with its predecessor so the first line is never compared
    # against the last one through a negative index.
    for previous, line in zip(lines, lines[1:]):
        if not line.startswith(markers):
            continue

        abbreviation = abbreviation_of.get(previous)
        if abbreviation is None:
            print("County not recognized: ", previous)
            continue

        results.append(abbreviation)
        if len(results) == 5:
            break

    return results


for pdf in pdfs:
    year = 0
    cat = 0
    if matches := re.findall(year202x, pdf):
        year = int(matches[0])
    elif re.findall(year2019, pdf):
        year = 2019
    elif re.findall(year2018, pdf):
        year = 2018
    elif re.findall(year2017, pdf):
        year = 2017
    else:
        print("Couldn't get year, skipping")
        continue

    if not (matches := re.findall(category, pdf)):
        print("Coudln't get category, skipping")
        continue
    cat = int(max(matches[0]))
    cat_str = f"{cat:>02}"

    if year != 2023:
        continue

    id = f"{year}-{cat_str}"
    path = f"data/r{id}.pdf"
    print(f"Downloading {id} - '{pdf}' -> '{path}'")
    urllib.request.urlretrieve(pdf, path)

    reader = PdfReader(path)
    if len(reader.pages) == 0:
        print("Coudln't find pages, skipping")
        continue
    page = reader.pages[0]

    text = page.extract_text()
    lines = text.split("\n")
    results = []
    match year:
        case 2024:
            results = extract_2024(cat_str, lines)
        case 2023:
            results = extract_2023(lines)
    print("got results: ", results)

    exit(0)