# (file-viewer header left over from extraction: 74 lines, 2.0 KiB, Python)
import os
import re
import urllib.request
from typing import List

import requests
from bs4 import BeautifulSoup

# Fetch the siov.sk competition page and collect every <a> element on it.
# An explicit timeout is passed because requests otherwise waits without
# limit for the server, and raise_for_status() aborts on an HTTP error
# status instead of parsing an error page as if it were the real listing.
r = requests.get(
    "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")

# Pattern strings matched against each PDF URL to derive the year and the
# category number used in the downloaded file's name.

# A year 2020-2029 taken from the upload folder: ".../uploads/202x/...".
year202x = r"(?<=\/uploads\/)202\d(?=\/)"

# Files uploaded under 2019/05 are labelled as year 2019.
year2019 = r"2019\/05"

# Files uploaded under 2019/02 are labelled as year 2018, unless the file
# name begins with "Vysledkova-listina-CK-SOC-2017".
year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)"

# Files whose URL contains this marker are labelled as year 2017.
year2017 = r"SOC-2017-odbor"

# Category number.  The pattern is four alternatives, each wrapped in its own
# capture group; the adjacent raw literals below concatenate into one string.
category = (
    # two digits starting with 0 or 1, not directly preceded by "20",
    # and directly followed by "-" or ".pdf"
    r"((?<!20)[01]\d((?=-)|(?=\.pdf)))"
    # two digits starting with 0 or 1, directly after "/" and before "Vysled"
    r"|((?<=\/)[01]\d(?=Vysled))"
    # a single digit sitting between "." and ".pdf"
    r"|((?<=\.)\d(?=\.pdf))"
    # a literal "2" directly followed by "-final"
    r"|(2(?=-final))"
)

def _is_wanted_pdf(href):
    """Return True when *href* falls under one of the whitelisted uploads.

    Each entry is an upload-month folder; some folders additionally require
    (or forbid) a marker substring somewhere in the URL.
    """
    return (
        "2024/04/" in href
        or ("2023/06/" in href and "Pisomne" not in href)
        or "2022/05/" in href
        or ("2021/05/" in href and "Vysled" in href)
        or ("2020/05/" in href and "Odbor" in href)
        or ("2019/05/" in href and "vysled" in href)
        or ("2019/02/" in href and "Vysled" in href)
    )


# Absolute URLs of every whitelisted PDF linked from the page.
pdfs = []
for link in links:
    href = link.get("href")
    # Anchors without an href attribute give None; skip them rather than
    # letting the substring test below raise TypeError.
    if not href or ".pdf" not in href:
        continue

    if not _is_wanted_pdf(href):
        continue

    # Links that do not already start with the site's origin get it prepended.
    if not href.startswith(("https://siov.sk", "http://siov.sk")):
        href = "https://siov.sk" + href
    pdfs.append(href)

# Report how many PDFs were selected and let the user back out before any
# download starts.  Anything other than "n"/"no" (case-insensitive,
# surrounding whitespace ignored) continues, matching the [Y/n] default.
print(f"Found {len(pdfs)} pdfs")
ok = input("Continue? [Y/n] ")
if ok.strip().lower() in ("n", "no"):
    print("okay, bye")
    # SystemExit is raised directly: the bare exit() helper is only injected
    # by the site module and is not guaranteed to exist in every interpreter.
    raise SystemExit(0)

def _detect_year(url):
    """Return the year label for *url*, or None if no year pattern matches.

    The patterns are tried in order: year202x (the matched digits are the
    year), then year2019, year2018 and year2017 (fixed labels).
    """
    if found := re.search(year202x, url):
        return int(found.group(0))
    if re.search(year2019, url):
        return 2019
    if re.search(year2018, url):
        return 2018
    if re.search(year2017, url):
        return 2017
    return None


def _detect_category(url):
    """Return the category number found in *url*, or None if there is none."""
    found = re.search(category, url)
    if found is None:
        return None
    # Every alternative of the category pattern is a capture group made of
    # digits plus zero-width lookarounds, so the text of the first match is
    # exactly the category digits.
    return int(found.group(0))


# urlretrieve() does not create missing directories, so make sure the target
# folder exists before the first download.
os.makedirs("data", exist_ok=True)

for pdf in pdfs:
    year = _detect_year(pdf)
    if year is None:
        print("Couldn't get year, skipping")
        continue

    cat = _detect_category(pdf)
    if cat is None:
        print("Coudln't get category, skipping")
        continue

    # File name: data/r<year>-<two-digit category>.pdf
    file_id = f"{year}-{cat:02d}"
    path = f"data/r{file_id}.pdf"
    print(f"Downloading {file_id} - '{pdf}' -> '{path}'")
    try:
        urllib.request.urlretrieve(pdf, path)
    except Exception as e:
        # Best effort: report the failure and carry on with the next PDF.
        print(e)