✨ Adds downloading of PDFs

2025-04-18 18:02:53 +02:00
parent f06cfcfdf2
commit 8a89952da1
3 changed files with 60 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+data/
--- a/find_pdfs.py
+++ b/find_pdfs.py
@@ -1,20 +1,27 @@
 import re
 import requests
+import urllib.request
 from bs4 import BeautifulSoup
+from pypdf import PdfReader

 r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
 soup = BeautifulSoup(r.content, "html.parser")
 links = soup.find_all("a")

-year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"
+year202x = r"(?<=\/uploads\/)202\d(?=\/)"  # a "202x" path segment directly after "/uploads/" and before "/"
+year2019 = r"2019\/05"  # literal "2019/05"; a hit is mapped to year 2019 by the download loop
+year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)"  # "2019/02/" not followed by that file name; mapped to 2018
+year2017 = r"SOC-2017-odbor"  # literal file-name fragment; mapped to 2017
+category = r"((?<!20)[01]\d((?=-)|(?=\.pdf)))|((?<=\/)[01]\d(?=Vysled))|((?<=\.)\d(?=\.pdf))|(2(?=-final))"  # category digits; alternatives: [01]N before "-" or ".pdf" (not preceded by "20") | [01]N between "/" and "Vysled" | one digit between "." and ".pdf" | "2" before "-final"

+pdfs = []
 for link in links:
    if ".pdf" not in link.get("href"):
        continue

    href = link.get("href")
    if not ("2024/04/" in href or \
-            "2023/06/" in href or \
+            ("2023/06/" in href and "Pisomne" not in href) or \
            "2022/05/" in href or \
            ("2021/05/" in href and "Vysled" in href) or \
            ("2020/05/" in href and "Odbor" in href) or \
@@ -26,3 +33,53 @@ for link in links:
    if not href.startswith("https://siov.sk") and not href.startswith("http://siov.sk"):
        href = "https://siov.sk" + href
    print(href)
+    pdfs.append(href)
+
+
+ok = input("Continue? [Y/n] ").strip().lower()  # tolerate stray whitespace and upper case
+if ok in ("n", "no"):  # anything else, including an empty answer, means "yes"
+    print("okay, bye")
+    raise SystemExit(0)  # same effect as exit(0) without relying on the site-provided helper
+
+
+counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]  # NOTE(review): not used in the lines shown; presumably Slovak region codes - confirm
+
+
+def extract_2024(cat_str: str, text: str):  # TODO: placeholder - no extraction logic yet
+    pass  # returns None; nothing in the lines shown calls this function
+
+
+import os  # local import: the makedirs call below is its only user
+os.makedirs("data", exist_ok=True)  # data/ is git-ignored, so it may be missing on a fresh checkout
+for pdf in pdfs:
+    # Determine the year from the URL; the first pattern that matches wins.
+    if matches := re.findall(year202x, pdf):
+        year = int(matches[0])
+    elif re.findall(year2019, pdf):
+        year = 2019
+    elif re.findall(year2018, pdf):
+        year = 2018
+    elif re.findall(year2017, pdf):
+        year = 2017
+    else:
+        print("Couldn't get year, skipping")
+        continue
+    # findall yields one tuple of groups per match; only the group holding the digits is non-empty.
+    if not (matches := re.findall(category, pdf)):
+        print("Couldn't get category, skipping")
+        continue
+    cat = int(max(matches[0]))  # max() selects the non-empty group ("" sorts lowest)
+    # Download to data/r<year>-<two-digit category>.pdf.
+    pdf_id = f"{year}-{cat:>02}"
+    path = f"data/r{pdf_id}.pdf"
+    print(f"Downloading {pdf_id} - '{pdf}' -> '{path}'")
+    urllib.request.urlretrieve(pdf, path)
+    # Print the text of the first page of the downloaded file.
+    reader = PdfReader(path)
+    if len(reader.pages) == 0:
+        print("Couldn't find pages, skipping")
+        continue
+    page = reader.pages[0]
+    print(page.extract_text())
+    raise SystemExit(0)  # NOTE(review): stops after the first parsed PDF - presumably temporary
+
--- a/pdfs.txt
+++ b/pdfs.txt
@@ -15,7 +15,6 @@ https://siov.sk/wp-content/uploads/2024/04/14-Tvorba-ucebnych-pomocok-didakticke
 https://siov.sk/wp-content/uploads/2024/04/15-Ekonomika-a-riadenie.pdf
 https://siov.sk/wp-content/uploads/2024/04/16-Teoria-kultury-umenie-umelecka-odevna-tvorba.pdf
 https://siov.sk/wp-content/uploads/2024/04/17-Pedagogika-psychologia-sociologia-1.pdf
-https://siov.sk/wp-content/uploads/2023/06/Pisomne-zhodnotenie-urovne-odbornych-prac-SOC-na-CK-2023.pdf
 https://siov.sk/wp-content/uploads/2023/06/01VysledkovaListina2023.pdf
 https://siov.sk/wp-content/uploads/2023/06/02VysledkovaListina2023.pdf
 https://siov.sk/wp-content/uploads/2023/06/03VysledkovaListina2023.pdf