import re
import urllib.request

import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader

# Fetch the SOČ (Stredoškolská odborná činnosť) competition page and collect every link on it.
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")

# Regexes identifying the upload-path pattern used for each year's result-listing PDFs.
year202x = r"(?<=\/uploads\/)202\d(?=\/)"
year2019 = r"2019\/05"
year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)"
year2017 = r"SOC-2017-odbor"

# NOTE: the source is truncated here. Only two fragments survive around the gap:
#     category = r"((?    ...    02}"
# i.e. the category regex, the header of the loop over `links`, and the code that
# derives `year`, the zero-padded `id`, and the PDF URL `pdf` have been lost.
for link in links:  # loop header reconstructed; the original link-selection logic is missing
    # ... missing: match a year regex and the category against the link, build `id` and `pdf` ...

    # Download the result-listing PDF into data/.
    path = f"data/r{id}.pdf"
    print(f"Downloading {id} - '{pdf}' -> '{path}'")
    urllib.request.urlretrieve(pdf, path)

    # Open the downloaded PDF and print the text of its first page.
    reader = PdfReader(path)
    if len(reader.pages) == 0:
        print("Couldn't find pages, skipping")
        continue
    page = reader.pages[0]
    print(page.extract_text())
    exit(0)  # stop after the first successfully processed PDF
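
# ---------------------------------------------------------------------------
# Hedged sketch of the missing middle section. The category regex and the
# link-selection code above did not survive, so nothing below is the original
# author's code: CATEGORY_GUESS, guess_year(), iter_result_pdfs(), and the
# f"{year}-{category:02}" id scheme are assumptions built only on the year
# regexes and the `links` list defined above.
# ---------------------------------------------------------------------------

# Assumed: the result-listing file name ends in a one- or two-digit category number.
CATEGORY_GUESS = r"(\d{1,2})(?=\.pdf$)"


def guess_year(href):
    """Map a link URL to a competition year using the year regexes defined above."""
    m = re.search(year202x, href)
    if m:
        return m.group(0)  # 2020+ uploads carry the year directly in the path
    if re.search(year2019, href):
        return "2019"
    if re.search(year2018, href):
        return "2018"
    if re.search(year2017, href):
        return "2017"
    return None


def iter_result_pdfs(links):
    """Yield (id, pdf_url) pairs for links that look like yearly result PDFs."""
    for link in links:
        href = link.get("href") or ""
        if not href.lower().endswith(".pdf"):
            continue
        year = guess_year(href)
        m = re.search(CATEGORY_GUESS, href)
        if year is None or m is None:
            continue
        yield f"{year}-{int(m.group(1)):02}", href


# Example of wiring the sketch into the surviving download/extract tail; note
# that the data/ directory must exist before urlretrieve() is called:
#
#   import os
#   os.makedirs("data", exist_ok=True)
#   for id, pdf in iter_result_pdfs(links):
#       path = f"data/r{id}.pdf"
#       ...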