"""Print the URLs of selected PDF documents linked from a siov.sk page.

The script downloads ``PAGE_URL``, scans every ``<a>`` element, keeps the
links that point at a PDF inside one of the hand-picked upload folders
listed in ``WANTED_UPLOADS`` and prints each one as an absolute URL.
"""
import re
from urllib.parse import urljoin

PAGE_URL = "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/"

# Pattern for the four-digit year directly after "wp-content/uploads/".
# NOTE(review): not referenced anywhere in this script; kept so the
# module-level name stays available.
year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"

# (upload-folder fragment, extra substring the href must also contain).
# ``None`` means every PDF in that folder is accepted.  The substring
# checks are case-sensitive on purpose ("Vysled" vs. "vysled").
WANTED_UPLOADS = (
    ("2024/04/", None),
    ("2023/06/", None),
    ("2022/05/", None),
    ("2021/05/", "Vysled"),
    ("2020/05/", "Odbor"),
    ("2019/05/", "vysled"),
    ("2019/02/", "Vysled"),
)


def is_wanted_pdf(href):
    """Return True if *href* is a PDF link matching ``WANTED_UPLOADS``.

    Args:
        href: Value of an anchor's ``href`` attribute; may be ``None`` or
            empty when the anchor has no target.

    Returns:
        ``True`` when *href* contains ``".pdf"`` and matches at least one
        (folder, keyword) rule, ``False`` otherwise.
    """
    # Anchors without an href yield None; treat them as "not wanted"
    # instead of crashing on the substring test.
    if not href or ".pdf" not in href:
        return False
    return any(
        folder in href and (keyword is None or keyword in href)
        for folder, keyword in WANTED_UPLOADS
    )


def absolute_url(href):
    """Resolve *href* against ``PAGE_URL`` and return an absolute URL.

    Already-absolute URLs (any scheme or host) are returned unchanged;
    root-relative, protocol-relative and page-relative references are
    resolved the way a browser would resolve them.
    """
    return urljoin(PAGE_URL, href)


def main():
    """Fetch ``PAGE_URL`` and print every wanted PDF link, one per line.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Third-party imports live here so the pure helpers above can be
    # imported (e.g. by tests) without the scraping dependencies.
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(PAGE_URL, timeout=30)
    # Fail loudly on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        if is_wanted_pdf(href):
            print(absolute_url(href))


if __name__ == "__main__":
    main()