🧐 Adds parsed data

Daniel Svitan
2025-05-03 17:24:32 +02:00
parent 8e4cac0789
commit 85ba8d9651
5 changed files with 157 additions and 218 deletions


@@ -2,7 +2,6 @@ import re
import requests
import urllib.request
from bs4 import BeautifulSoup
from pypdf import PdfReader
from typing import List
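
# fetch the SOČ (Stredoškolská odborná činnosť) results page; the result-PDF links used below are scraped from it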
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
@@ -43,62 +42,6 @@ if ok.lower() == "n":
    exit(0)

counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
counties2023 = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
                "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]
counties2022 = ["Bratislavský", "Trnavský", "Trenčiansky", "Nitriansky",
                "Žilinský", "Banskobystrický", "Prešovský", "Košický"]
def extract_2024(cat_str: str, lines: List[str]):
    results = []
    for i in range(len(lines)):
        line = lines[i]
        if not line.startswith(cat_str):
            continue
        county = lines[i - 1][-2:]
        if county not in counties:
            continue
        results.append(county)
        if len(results) == 5:
            return results
    return results
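
# 2023 result PDFs: the county is printed as a full name ("Bratislavský kraj", ...)
# on its own line; map it back to its two-letter code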
def extract_2023(lines: List[str]):
    results = []
    for line in lines:
        if line not in counties2023:
            continue
        results.append(counties[counties2023.index(line)])
        if len(results) == 5:
            return results
    return results
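
# 2022 result PDFs: the county is the last word of the row ("Bratislavský", ...);
# map it back to its two-letter code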
def extract_2022(lines: List[str]):
    results = []
    for line in lines:
        word = line.strip().split(" ")[-1].strip()
        if word not in counties2022:
            continue
        results.append(counties[counties2022.index(word)])
        if len(results) == 5:
            return results
    return results
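
# download each result PDF, pull the text of its first page
# and run the extractor matching its year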
for pdf in pdfs:
    year = 0
    cat = 0
@@ -117,34 +60,14 @@ for pdf in pdfs:
    if not (matches := re.findall(category, pdf)):
        print("Couldn't get category, skipping")
        continue
    # category
    cat = int(max(matches[0]))
    cat_str = f"{cat:>02}"

    if year != 2022:
        continue

    id = f"{year}-{cat_str}"
    path = f"data/r{id}.pdf"
    print(f"Downloading {id} - '{pdf}' -> '{path}'")
    urllib.request.urlretrieve(pdf, path)
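
    # open the downloaded PDF and work with the text of its first page only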
    reader = PdfReader(path)
    if len(reader.pages) == 0:
        print("Couldn't find pages, skipping")
        continue
    page = reader.pages[0]
    text = page.extract_text()
    lines = text.split("\n")

    results = []
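    # the PDF layout differs per year, so pick the matching extractor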
    match year:
        case 2024:
            results = extract_2024(cat_str, lines)
        case 2023:
            results = extract_2023(lines)
        case 2022:
            results = extract_2022(lines)

    print("got results: ", results)
    exit(0)
    try:
        urllib.request.urlretrieve(pdf, path)
    except Exception as e:
        print(e)