⚡ Adds parsing 2024 and 2023 pdf results
This commit is contained in:
parent
23c2e3a0e7
commit
96a6810fcd
61
get_data.py
61
get_data.py
@ -3,6 +3,7 @@ import requests
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
from typing import List
|
||||||
|
|
||||||
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
|
r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/")
|
||||||
soup = BeautifulSoup(r.content, "html.parser")
|
soup = BeautifulSoup(r.content, "html.parser")
|
||||||
@ -32,10 +33,10 @@ for link in links:
|
|||||||
|
|
||||||
if not href.startswith("https://siov.sk") and not href.startswith("http://siov.sk"):
|
if not href.startswith("https://siov.sk") and not href.startswith("http://siov.sk"):
|
||||||
href = "https://siov.sk" + href
|
href = "https://siov.sk" + href
|
||||||
print(href)
|
|
||||||
pdfs.append(href)
|
pdfs.append(href)
|
||||||
|
|
||||||
|
|
||||||
|
print(f"Found {len(pdfs)} pdfs")
|
||||||
ok = input("Continue? [Y/n] ")
|
ok = input("Continue? [Y/n] ")
|
||||||
if ok.lower() == "n":
|
if ok.lower() == "n":
|
||||||
print("okay, bye")
|
print("okay, bye")
|
||||||
@ -43,10 +44,47 @@ if ok.lower() == "n":
|
|||||||
|
|
||||||
|
|
||||||
# Two-letter county codes; the order is index-aligned with ``semifull_counties``.
counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
# County names as matched (exactly) against lines of the 2023 result sheets.
# NOTE(review): "Trenčianský kraj" is spelled with a long "ý" while
# "Nitriansky kraj" uses a short "y" -- presumably one of them does not match
# the text in the PDFs; confirm against a real 2023 sheet before changing it.
semifull_counties = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
                     "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]

# Both extractors stop after collecting this many counties.
_MAX_RESULTS = 5

# Line prefixes that mark the line *following* a county name on a 2023 sheet.
_RANK_PREFIXES_2023 = ("2.", "3.", "4.", "5.", "6.")


def extract_2024(cat_str: str, lines: list[str]) -> list[str]:
    """Collect county codes from the lines of a 2024 result sheet.

    A line counts as a hit when it starts with ``cat_str`` and the last two
    characters of the line directly before it are a known county code.

    Args:
        cat_str: Prefix to look for at the start of a line (the caller passes
            the zero-padded category number).
        lines: The page text, already split into lines.

    Returns:
        County codes in order of appearance, at most ``_MAX_RESULTS`` of them.
    """
    results: list[str] = []

    # Walk (previous line, current line) pairs. The first line has no
    # predecessor, so it is never treated as a hit; indexing ``lines[i - 1]``
    # at ``i == 0`` would silently wrap around to the last line of the page.
    for previous, line in zip(lines, lines[1:]):
        if not line.startswith(cat_str):
            continue

        county = previous[-2:]
        if county not in counties:
            continue

        results.append(county)
        if len(results) == _MAX_RESULTS:
            break

    return results


def extract_2023(lines: list[str]) -> list[str]:
    """Collect county codes from the lines of a 2023 result sheet.

    A line counts as a hit when it starts with one of "2." .. "6." and the
    line directly before it is exactly one of ``semifull_counties``. When the
    preceding line is not a known county name, it is reported on stdout and
    skipped.

    Args:
        lines: The page text, already split into lines.

    Returns:
        County codes (from ``counties``) in order of appearance, at most
        ``_MAX_RESULTS`` of them.
    """
    # Built per call so that the lookup always reflects the current lists.
    code_by_name = dict(zip(semifull_counties, counties))
    results: list[str] = []

    # Pairwise walk for the same reason as in extract_2024: the first line
    # must not be paired with the last line of the page.
    for previous, line in zip(lines, lines[1:]):
        if not line.startswith(_RANK_PREFIXES_2023):
            continue

        code = code_by_name.get(previous)
        if code is None:
            print("County not recognized: ", previous)
            continue

        results.append(code)
        if len(results) == _MAX_RESULTS:
            break

    return results
|
||||||
|
|
||||||
|
|
||||||
for pdf in pdfs:
|
for pdf in pdfs:
|
||||||
@ -68,8 +106,12 @@ for pdf in pdfs:
|
|||||||
print("Coudln't get category, skipping")
|
print("Coudln't get category, skipping")
|
||||||
continue
|
continue
|
||||||
cat = int(max(matches[0]))
|
cat = int(max(matches[0]))
|
||||||
|
cat_str = f"{cat:>02}"
|
||||||
|
|
||||||
id = f"{year}-{cat:>02}"
|
if year != 2023:
|
||||||
|
continue
|
||||||
|
|
||||||
|
id = f"{year}-{cat_str}"
|
||||||
path = f"data/r{id}.pdf"
|
path = f"data/r{id}.pdf"
|
||||||
print(f"Downloading {id} - '{pdf}' -> '{path}'")
|
print(f"Downloading {id} - '{pdf}' -> '{path}'")
|
||||||
urllib.request.urlretrieve(pdf, path)
|
urllib.request.urlretrieve(pdf, path)
|
||||||
@ -79,7 +121,16 @@ for pdf in pdfs:
|
|||||||
print("Coudln't find pages, skipping")
|
print("Coudln't find pages, skipping")
|
||||||
continue
|
continue
|
||||||
page = reader.pages[0]
|
page = reader.pages[0]
|
||||||
print(page.extract_text())
|
|
||||||
|
text = page.extract_text()
|
||||||
|
lines = text.split("\n")
|
||||||
|
results = []
|
||||||
|
match year:
|
||||||
|
case 2024:
|
||||||
|
results = extract_2024(cat_str, lines)
|
||||||
|
case 2023:
|
||||||
|
results = extract_2023(lines)
|
||||||
|
print("got results: ", results)
|
||||||
|
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user