diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8fce603 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/find_pdfs.py b/find_pdfs.py index 1ddffe5..af22714 100644 --- a/find_pdfs.py +++ b/find_pdfs.py @@ -1,20 +1,27 @@ import re import requests +import urllib.request from bs4 import BeautifulSoup +from pypdf import PdfReader r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/") soup = BeautifulSoup(r.content, "html.parser") links = soup.find_all("a") -year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)" +year202x = r"(?<=\/uploads\/)202\d(?=\/)" +year2019 = r"2019\/05" +year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)" +year2017 = r"SOC-2017-odbor" +category = r"((?02}" + path = f"data/r{id}.pdf" + print(f"Downloading {id} - '{pdf}' -> '{path}'") + urllib.request.urlretrieve(pdf, path) + + reader = PdfReader(path) + if len(reader.pages) == 0: + print("Coudln't find pages, skipping") + continue + page = reader.pages[0] + print(page.extract_text()) + + exit(0) + diff --git a/pdfs.txt b/pdfs.txt index c554974..58e91e4 100644 --- a/pdfs.txt +++ b/pdfs.txt @@ -15,7 +15,6 @@ https://siov.sk/wp-content/uploads/2024/04/14-Tvorba-ucebnych-pomocok-didakticke https://siov.sk/wp-content/uploads/2024/04/15-Ekonomika-a-riadenie.pdf https://siov.sk/wp-content/uploads/2024/04/16-Teoria-kultury-umenie-umelecka-odevna-tvorba.pdf https://siov.sk/wp-content/uploads/2024/04/17-Pedagogika-psychologia-sociologia-1.pdf -https://siov.sk/wp-content/uploads/2023/06/Pisomne-zhodnotenie-urovne-odbornych-prac-SOC-na-CK-2023.pdf https://siov.sk/wp-content/uploads/2023/06/01VysledkovaListina2023.pdf https://siov.sk/wp-content/uploads/2023/06/02VysledkovaListina2023.pdf https://siov.sk/wp-content/uploads/2023/06/03VysledkovaListina2023.pdf