# state-soc-cross/find_pdfs.py

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Page whose anchors are scanned for PDF links; also the base against which
# relative hrefs are resolved.
BASE_URL = "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/"

# Seconds to wait for the server before giving up (requests has no default
# timeout and would otherwise block forever on a stalled connection).
REQUEST_TIMEOUT = 30

# Currently unused by this script; kept so the module-level name stays available.
year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"

# (path fragment, required substring) pairs. A link is accepted when its href
# contains the path fragment and, if the second item is not None, also that
# substring (case-sensitive).
# NOTE(review): fragments look like WordPress "year/month" upload folders and
# the keywords like parts of results-file names -- confirm against the site.
WANTED_UPLOADS = (
    ("2024/04/", None),
    ("2023/06/", None),
    ("2022/05/", None),
    ("2021/05/", "Vysled"),
    ("2020/05/", "Odbor"),
    ("2019/05/", "vysled"),
    ("2019/02/", "Vysled"),
)


def is_wanted_pdf(href):
    """Return True if *href* is a PDF link matching one of WANTED_UPLOADS.

    Args:
        href: Value of an anchor's ``href`` attribute, or ``None`` when the
            anchor has no such attribute.

    Returns:
        ``True`` when *href* contains ``".pdf"`` and satisfies at least one
        ``(fragment, keyword)`` rule; ``False`` otherwise (including for
        ``None`` or an empty string).
    """
    # Anchors without an href yield None; treat them as non-matching instead
    # of failing on the substring test.
    if not href or ".pdf" not in href:
        return False
    return any(
        fragment in href and (keyword is None or keyword in href)
        for fragment, keyword in WANTED_UPLOADS
    )


def absolute_url(href):
    """Resolve *href* against BASE_URL and return an absolute URL.

    Already-absolute URLs (any host, http or https) are returned unchanged;
    root-relative, path-relative and protocol-relative hrefs are joined with
    the base page URL.
    """
    return urljoin(BASE_URL, href)


def main():
    """Fetch BASE_URL and print the absolute URL of every wanted PDF link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        if is_wanted_pdf(href):
            print(absolute_url(href))


if __name__ == "__main__":
    main()