state-soc-cross/find_pdfs.py
Daniel Svitan f06cfcfdf2 🎉 Initial commit
2025-04-18 17:09:50 +02:00

29 lines
882 B
Python

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Page whose anchors are scanned for PDF links; also the base against which
# non-absolute hrefs are resolved.
PAGE_URL = "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/"

# Upload folders ("YYYY/MM/") whose PDFs are kept, paired with a substring the
# link must additionally contain (None keeps every PDF from that folder).
# Matching is case-sensitive, hence both "Vysled" and "vysled" appear.
WANTED_UPLOADS = (
    ("2024/04/", None),
    ("2023/06/", None),
    ("2022/05/", None),
    ("2021/05/", "Vysled"),
    ("2020/05/", "Odbor"),
    ("2019/05/", "vysled"),
    ("2019/02/", "Vysled"),
)


def is_wanted_pdf(href):
    """Return True if *href* is a PDF link matching one of WANTED_UPLOADS.

    A link qualifies when it contains ".pdf", contains one of the listed
    upload folders, and (where a keyword is given for that folder) also
    contains that keyword.
    """
    if ".pdf" not in href:
        return False
    return any(
        folder in href and (keyword is None or keyword in href)
        for folder, keyword in WANTED_UPLOADS
    )


# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops it from silently parsing an HTTP error page.
r = requests.get(PAGE_URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")
# NOTE(review): this pattern is not used anywhere in the visible script; it is
# kept unchanged in case something else relies on the name.
year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"

for link in links:
    href = link.get("href")
    # Anchors without an href attribute yield None; skip them instead of
    # failing on the substring checks.
    if not href or not is_wanted_pdf(href):
        continue
    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # relative and scheme-relative links against the page URL.
    print(urljoin(PAGE_URL, href))