diff --git a/get_data.py b/get_data.py
index af22714..e371c65 100644
--- a/get_data.py
+++ b/get_data.py
@@ -3,6 +3,7 @@ import requests
 import urllib.request
 from bs4 import BeautifulSoup
 from pypdf import PdfReader
+from typing import List
 
 r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
 soup = BeautifulSoup(r.content, "html.parser")
@@ -32,10 +33,10 @@ for link in links:
     if not href.startswith("https://siov.sk") and not href.startswith("http://siov.sk"):
         href = "https://siov.sk" + href
 
-    print(href)
     pdfs.append(href)
 
 
+print(f"Found {len(pdfs)} pdfs")
 ok = input("Continue? [Y/n] ")
 if ok.lower() == "n":
     print("okay, bye")
@@ -43,10 +44,46 @@ if ok.lower() == "n":
 
 
 counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
+semifull_counties = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
+                     "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]
 
 
-def extract_2024(cat_str: str, text: str):
-    pass
+def extract_2024(cat_str: str, lines: List[str]):
+    results = []
+
+    for i in range(len(lines)):
+        line = lines[i]
+        if not line.startswith(cat_str):
+            continue
+
+        county = lines[i-1][-2:]
+        if county not in counties:
+            continue
+
+        results.append(county)
+        if len(results) == 5:
+            return results
+
+    return results
+
+
+def extract_2023(lines: List[str]):
+    results = []
+
+    for i in range(len(lines)):
+        line = lines[i]
+        if line[0:2] not in ["2.", "3.", "4.", "5.", "6."]:
+            continue
+
+        if lines[i-1] not in semifull_counties:
+            print("County not recognized: ", lines[i-1])
+            continue
+
+        results.append(counties[semifull_counties.index(lines[i-1])])
+        if len(results) == 5:
+            return results
+
+    return results
 
 
 for pdf in pdfs:
@@ -68,8 +105,12 @@ for pdf in pdfs:
         print("Coudln't get category, skipping")
         continue
     cat = int(max(matches[0]))
+    cat_str = f"{cat:>02}"
 
-    id = f"{year}-{cat:>02}"
+    if year != 2023:
+        continue
+
+    id = f"{year}-{cat_str}"
     path = f"data/r{id}.pdf"
     print(f"Downloading {id} - '{pdf}' -> '{path}'")
     urllib.request.urlretrieve(pdf, path)
@@ -79,7 +120,16 @@ for pdf in pdfs:
         print("Coudln't find pages, skipping")
         continue
     page = reader.pages[0]
-    print(page.extract_text())
+
+    text = page.extract_text()
+    lines = text.split("\n")
+    results = []
+    match year:
+        case 2024:
+            results = extract_2024(cat_str, lines)
+        case 2023:
+            results = extract_2023(lines)
+    print("got results: ", results)
 
 
     exit(0)