✨ Adds downloading pdfs
This commit is contained in:
parent
f06cfcfdf2
commit
8a89952da1
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data/
|
61
find_pdfs.py
61
find_pdfs.py
@ -1,20 +1,27 @@
|
||||
import os
import re
import urllib.request

import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader
|
||||
|
||||
# Fetch the SIOV competition page and collect every anchor tag on it.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
r = requests.get(
    "https://siov.sk/sutaze/stredoskolska-odborna-cinnost/",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("a")

# Patterns matched against a PDF URL to decide which year it belongs to.
# Generic "uploads/20xx/" year; not referenced by the code visible here.
year_regex = r"(?<=wp-content\/uploads\/)20\d\d(?=\/)"
# Upload folder 202x: the matched digits are used directly as the year.
year202x = r"(?<=\/uploads\/)202\d(?=\/)"
# Files under 2019/05 are treated as year 2019.
year2019 = r"2019\/05"
# Files under 2019/02 are treated as year 2018, except the one whose name
# starts with "Vysledkova-listina-CK-SOC-2017".
year2018 = r"2019\/02\/(?!Vysledkova-listina-CK-SOC-2017)"
# File names containing "SOC-2017-odbor" are treated as year 2017.
year2017 = r"SOC-2017-odbor"
# Category number embedded in the file name. Four alternatives:
#   1. two digits starting with 0 or 1, not preceded by "20", followed by "-" or ".pdf"
#   2. two digits starting with 0 or 1, right after "/" and right before "Vysled"
#   3. a single digit between "." and ".pdf"
#   4. a literal "2" followed by "-final"
category = r"((?<!20)[01]\d((?=-)|(?=\.pdf)))|((?<=\/)[01]\d(?=Vysled))|((?<=\.)\d(?=\.pdf))|(2(?=-final))"

# URLs of the PDFs selected by the link loop that follows.
pdfs = []
|
||||
for link in links:
|
||||
if ".pdf" not in link.get("href"):
|
||||
continue
|
||||
|
||||
href = link.get("href")
|
||||
if not ("2024/04/" in href or \
|
||||
"2023/06/" in href or \
|
||||
("2023/06/" in href and "Pisomne" not in href) or \
|
||||
"2022/05/" in href or \
|
||||
("2021/05/" in href and "Vysled" in href) or \
|
||||
("2020/05/" in href and "Odbor" in href) or \
|
||||
@ -26,3 +33,53 @@ for link in links:
|
||||
if not href.startswith("https://siov.sk") and not href.startswith("http://siov.sk"):
|
||||
href = "https://siov.sk" + href
|
||||
print(href)
|
||||
pdfs.append(href)
|
||||
|
||||
|
||||
# Let the user review the printed URLs before anything is downloaded.
# Only an explicit "n" (any case, surrounding whitespace ignored) aborts;
# every other answer, including just pressing Enter, continues.
ok = input("Continue? [Y/n] ")
if ok.strip().lower() == "n":
    print("okay, bye")
    # SystemExit rather than the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to be defined.
    raise SystemExit(0)
|
||||
|
||||
|
||||
# Two-letter codes, kept as a list of strings.
# NOTE(review): presumably abbreviations of Slovakia's eight regions - confirm.
counties = "BA TT TN NR ZA BB PO KE".split()
|
||||
|
||||
|
||||
def extract_2024(cat_str: str, text: str):
    """Placeholder that accepts a category string and a text and does nothing.

    Both arguments are currently ignored and the function returns ``None``.

    NOTE(review): the name suggests this will parse 2024 result text for one
    category - confirm the intended contract before implementing.
    """
    return None
|
||||
|
||||
|
||||
# Download every collected PDF into data/ and print the text of its first page.

# data/ is git-ignored, so a fresh checkout does not contain it; without this
# urlretrieve() fails with FileNotFoundError on the first download.
os.makedirs("data", exist_ok=True)

for pdf in pdfs:
    # Determine the year from the URL, trying the most specific source first.
    if matches := re.findall(year202x, pdf):
        year = int(matches[0])
    elif re.search(year2019, pdf):
        year = 2019
    elif re.search(year2018, pdf):
        year = 2018
    elif re.search(year2017, pdf):
        year = 2017
    else:
        print("Couldn't get year, skipping")
        continue

    if not (matches := re.findall(category, pdf)):
        print("Couldn't get category, skipping")
        continue
    # The category pattern contains several groups, so findall() yields tuples
    # of group strings; max() picks the non-empty (digit) group because the
    # empty string sorts before any digit.
    cat = int(max(matches[0]))

    # Named pdf_id rather than id so the builtin is not shadowed.
    pdf_id = f"{year}-{cat:02d}"
    path = f"data/r{pdf_id}.pdf"
    print(f"Downloading {pdf_id} - '{pdf}' -> '{path}'")
    urllib.request.urlretrieve(pdf, path)

    reader = PdfReader(path)
    if len(reader.pages) == 0:
        print("Couldn't find pages, skipping")
        continue
    page = reader.pages[0]
    print(page.extract_text())

    # NOTE(review): this stops the script after the first PDF whose first page
    # was printed, which looks like a temporary debugging stop. The original
    # indentation was not recoverable from the diff view - confirm whether the
    # exit belongs inside the loop or after it.
    raise SystemExit(0)
|
||||
|
||||
|
1
pdfs.txt
1
pdfs.txt
@ -15,7 +15,6 @@ https://siov.sk/wp-content/uploads/2024/04/14-Tvorba-ucebnych-pomocok-didakticke
|
||||
https://siov.sk/wp-content/uploads/2024/04/15-Ekonomika-a-riadenie.pdf
|
||||
https://siov.sk/wp-content/uploads/2024/04/16-Teoria-kultury-umenie-umelecka-odevna-tvorba.pdf
|
||||
https://siov.sk/wp-content/uploads/2024/04/17-Pedagogika-psychologia-sociologia-1.pdf
|
||||
https://siov.sk/wp-content/uploads/2023/06/Pisomne-zhodnotenie-urovne-odbornych-prac-SOC-na-CK-2023.pdf
|
||||
https://siov.sk/wp-content/uploads/2023/06/01VysledkovaListina2023.pdf
|
||||
https://siov.sk/wp-content/uploads/2023/06/02VysledkovaListina2023.pdf
|
||||
https://siov.sk/wp-content/uploads/2023/06/03VysledkovaListina2023.pdf
|
||||
|
Loading…
x
Reference in New Issue
Block a user