29 lines
882 B
Python
29 lines
882 B
Python
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
# Page whose anchor tags are scanned for PDF links.
PAGE_URL = "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/"

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# NOTE(review): not referenced anywhere in this script; kept only so the
# module-level name stays defined.
year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"

# Each rule is (upload-folder fragment, extra fragment that must also appear
# in the href, or None when the folder alone is enough). Matching is a plain,
# case-sensitive substring test.
PDF_RULES = (
    ("2024/04/", None),
    ("2023/06/", None),
    ("2022/05/", None),
    ("2021/05/", "Vysled"),
    ("2020/05/", "Odbor"),
    ("2019/05/", "vysled"),
    ("2019/02/", "Vysled"),
)


def is_wanted_pdf(href):
    """Return True if *href* is a PDF link accepted by one of ``PDF_RULES``.

    Args:
        href: The raw ``href`` attribute value of an anchor tag, or ``None``
            when the anchor has no ``href`` at all.

    Returns:
        ``True`` when *href* contains ``".pdf"`` and satisfies at least one
        rule; ``False`` otherwise (including for ``None`` and ``""``).
    """
    # Anchors without an href yield None; reject them instead of crashing on
    # the substring test.
    if not href or ".pdf" not in href:
        return False
    return any(
        folder in href and (keyword is None or keyword in href)
        for folder, keyword in PDF_RULES
    )


def absolute_url(href, base=PAGE_URL):
    """Resolve *href* against *base* and return an absolute URL.

    Already-absolute URLs are returned as they are; root-relative paths are
    attached to the host of *base*.
    """
    return urljoin(base, href)


def main():
    """Fetch ``PAGE_URL`` and print the absolute URL of every accepted PDF.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(PAGE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        if is_wanted_pdf(href):
            print(absolute_url(href))


if __name__ == "__main__":
    main()