# Script: download PDFs linked from the SIOV "stredoskolska-odborna-cinnost"
# page and extract data from the text of their first page.
#
# What the visible statements do, in order:
#   1. Fetch https://siov.sk/sutaze/stredoskolska-odborna-cinnost/ with
#      `requests` and collect every anchor element via BeautifulSoup
#      (`links`).
#   2. Define the regex patterns `year202x`, `year2019`, `year2018`,
#      `year2017` and `category`. Presumably they are matched against the
#      link URLs to derive a year and a category number -- the matching code
#      is not visible here, TODO confirm.
#   3. For one (`year`, `cat_str`, `pdf`) triple: skip it unless
#      `year == 2023`, download `pdf` to `data/r{year}-{cat_str}.pdf` with
#      `urllib.request.urlretrieve`, open it with `PdfReader`, skip it when
#      it has no pages, take the text of the FIRST page only, split it on
#      "\n", dispatch to `extract_2024(cat_str, lines)` or
#      `extract_2023(lines)` depending on `year`, print the result and call
#      `exit(0)`.
#
# NOTE(review): this copy of the file is damaged and will not parse as is.
#   * All newlines/indentation have been collapsed onto one line.
#   * Text is missing between the start of the `category` pattern
#     (`r"((?`) and the tail of an f-string format spec (`02}"`). The lost
#     span presumably holds the rest of `category`, the definitions of
#     `extract_2024` / `extract_2023`, and the loop header that binds
#     `year`, `cat_str` and `pdf` -- restore it from the original file.
# NOTE(review): `if year != 2023: continue` precedes `match year`, so the
#   `case 2024` branch looks unreachable if both live in the same loop body
#   (indentation is lost, so this cannot be confirmed from here). The
#   filter looks like a debugging leftover -- confirm.
# NOTE(review): `exit(0)` follows the final print; if it sits inside the
#   loop, the script stops after the first processed PDF -- confirm intent.
# NOTE(review): `year2018` matches the path "2019/02/" (excluding
#   "Vysledkova-listina-CK-SOC-2017"); presumably the 2018 files were
#   uploaded under that folder -- verify against the live page.
# NOTE(review): `id` shadows the builtin of the same name, and the message
#   "Coudln't find pages, skipping" contains a typo ("Couldn't").
# NOTE(review): the download assumes a `data/` directory already exists --
#   nothing visible creates it.
import re import requests import urllib.request from bs4 import BeautifulSoup from pypdf import PdfReader from typing import List r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/") soup = BeautifulSoup(r.content, "html.parser") links = soup.find_all("a") year202x = r"(?<=\/uploads\/)202\d(?=\/)" year2019 = r"2019\/05" year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)" year2017 = r"SOC-2017-odbor" category = r"((?02}" if year != 2023: continue id = f"{year}-{cat_str}" path = f"data/r{id}.pdf" print(f"Downloading {id} - '{pdf}' -> '{path}'") urllib.request.urlretrieve(pdf, path) reader = PdfReader(path) if len(reader.pages) == 0: print("Coudln't find pages, skipping") continue page = reader.pages[0] text = page.extract_text() lines = text.split("\n") results = [] match year: case 2024: results = extract_2024(cat_str, lines) case 2023: results = extract_2023(lines) print("got results: ", results) exit(0)