diff --git a/.gitignore b/.gitignore index 8fce603..39d711e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ data/ +data.zip diff --git a/get_data.py b/get_data.py index 8c84a58..ec54bde 100644 --- a/get_data.py +++ b/get_data.py @@ -2,7 +2,6 @@ import re import requests import urllib.request from bs4 import BeautifulSoup -from pypdf import PdfReader from typing import List r = requests.get("https://siov.sk/sutaze/stredoskolska-odborna-cinnost/") @@ -43,62 +42,6 @@ if ok.lower() == "n": exit(0) -counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"] -counties2023 = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj", - "Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"] -counties2022 = ["Bratislavský", "Trnavský", "Trenčiansky", "Nitriansky", - "Žilinský", "Banskobystrický", "Prešovský", "Košický"] - - -def extract_2024(cat_str: str, lines: List[str]): - results = [] - lines = text.split("\n") - - for i in range(len(lines)): - line = lines[i] - if not line.startswith(cat_str): - continue - - county = lines[i-1][-2:] - if county not in counties: - continue - - results.append(county) - if len(results) == 5: - return results - - return results - - -def extract_2023(lines: List[str]): - results = [] - - for line in lines: - if line not in counties2023: - continue - - results.append(counties[counties2023.index(line)]) - if len(results) == 5: - return results - - return results - - -def extract_2022(lines: List[str]): - results = [] - - for line in lines: - word = line.strip().split(" ")[-1].strip() - if word not in counties2022: - continue - - results.append(counties[counties2022.index(word)]) - if len(results) == 5: - return results - - return results - - for pdf in pdfs: year = 0 cat = 0 @@ -117,34 +60,14 @@ for pdf in pdfs: if not (matches := re.findall(category, pdf)): print("Coudln't get category, skipping") continue + # category cat = int(max(matches[0])) cat_str = f"{cat:>02}" - if year != 2022: - 
continue - id = f"{year}-{cat_str}" path = f"data/r{id}.pdf" print(f"Downloading {id} - '{pdf}' -> '{path}'") - urllib.request.urlretrieve(pdf, path) - - reader = PdfReader(path) - if len(reader.pages) == 0: - print("Coudln't find pages, skipping") - continue - page = reader.pages[0] - - text = page.extract_text() - lines = text.split("\n") - results = [] - match year: - case 2024: - results = extract_2024(cat_str, lines) - case 2023: - results = extract_2023(lines) - case 2022: - results = extract_2022(lines) - print("got results: ", results) - - exit(0) - + try: + urllib.request.urlretrieve(pdf, path) + except Exception as e: + print(e) diff --git a/parsed.txt b/parsed.txt new file mode 100644 index 0000000..985283e --- /dev/null +++ b/parsed.txt @@ -0,0 +1,149 @@ +2017 01 ZA,PO,TT,KE,TN +2017 02 BB,TT,NR,PO,TT +2017 03 BA,BB,NR,ZA,TT +2017 04 BA,KE,ZA,ZA,BA +2017 05 TN,TT,BB,ZA,BB +2017 06 NR,KE,TT,KE,BA +2017 07 KE,NR,TN,ZA,KE +2017 08 TN,NR,BA,TN,PO +2017 09 KE,TN,KE,PO,TT +2017 10 BB,TN,ZA,NR,BB +2017 11 PO,ZA,BA,KE,BA +2017 12 ZA,KE,ZA,BB,BA +2017 13 KE,ZA,BB,PO,NR +2017 14 KE,TT,BA,ZA,BB +2017 15 ZA,TN,PO,BA,PO +2017 16 BB,TT,BA,BB,TT +2017 17 KE,ZA,TN,KE,NR +2018 01 NR,PO,NR,TN,BA +2018 02 ZA,NR,BB,TN,NR +2018 03 ZA,BB,BA,TN,PO +2018 04 TN,NR,BA,BA,KE +2018 05 PO,BB,ZA,NR,TT +2018 06 BB,ZA,TT,BA,KE +2018 07 NR,BB,TT,ZA,PO +2018 08 NR,TT,PO,BB,BA +2018 09 PO,NR,TN,KE,ZA +2018 10 KE,BB,ZA,PO,TT +2018 11 KE,BA,TN,ZA,NR +2018 12 BB,PO,KE,TT,NR +2018 13 ZA,KE,PO,NR,KE +2018 14 PO,BB,BA,TN,PO +2018 15 PO,TT,TT,BA,KE +2018 16 BB,TN,PO,BA,KE +2018 17 TT,KE,BB,ZA,TN +2019 01 PO,KE,TN,ZA,TN +2019 02 BA,TT,TT,KE,TN +2019 03 ZA,KE,TN,BA,NR +2019 04 TT,NR,ZA,NR,BB +2019 05 BA,BB,NR,NR,ZA +2019 06 BB,BB,ZA,NR,TT +2019 07 ZA,BB,TN,NR,TT +2019 08 TT,ZA,PO,BB,KE +2019 09 KE,TN,PO,TN,NR +2019 10 ZA,BB,ZA,TT,NR +2019 11 PO,BA,NR,KE,ZA +2019 12 KE,BA,NR,KE,ZA +2019 13 ZA,ZA,NR,KE,BB +2019 14 PO,BB,TN,BA,TN +2019 15 KE,TT,ZA,PO,BA +2019 16 PO,BB,BA,NR,KE +2019 17 
KE,BA,PO,BA,TN +2020 01 PO,ZA,TT,PO,ZA +2020 02 ZA,ZA,KE,TT,KE +2020 03 ZA,ZA,BB,BA,PO +2020 04 NR,ZA,TN,PO,ZA +2020 05 BA,TN,ZA,PO,KE +2020 06 ZA,NR,TN,BB,KE +2020 07 ZA,PO,KE,TT,BA +2020 08 TN,BB,KE,PO,TT +2020 09 KE,TT,TT,ZA,NR +2020 10 ZA,NR,TN,PO,KE +2020 11 TT,BA,BA,ZA,ZA +2020 12 NR,KE,KE,PO,PO +2020 13 ZA,KE,NR,TN,ZA +2020 14 BB,TT,PO,ZA,KE +2020 15 KE,PO,TN,ZA,NR +2020 16 ZA,TT,ZA,NR,TT +2020 17 ZA,BB,NR,ZA,TT +2021 01 ZA,KE,KE,BA,PO +2021 02 NR,KE,PO,TN,ZA +2021 03 KE,PO,KE,TT,BB +2021 04 BA,ZA,NR,BB,PO +2021 05 PO,TN,PO,TN,ZA +2021 06 NR,BA,PO,ZA,KE +2021 07 ZA,KE,TN,TT,ZA +2021 08 TN,TT,PO,NR,BB +2021 09 PO,KE,TT,BA,TN +2021 10 ZA,NR,ZA,KE,TT +2021 11 NR,BA,ZA,KE,TT +2021 12 PO,KE,BB,BB,NR +2021 13 ZA,ZA,BA,PO,TN +2021 14 PO,BB,KE,ZA,TT +2021 15 NR,BA,ZA,TT,BB +2021 16 TT,PO,ZA,NR,BA +2021 17 TT,TN,KE,BA,PO +2022 01 ZA,NR,KE,TT,NR +2022 02 PO,KE,ZA,BA,TN +2022 03 ZA,TN,NR,PO,KE +2022 04 PO,BA,NR,ZA,PO +2022 05 BA,NR,KE,KE,NR +2022 06 ZA,NR,TN,TT,KE +2022 07 KE,PO,NR,TN,ZA +2022 08 NR,NR,PO,TN,KE +2022 09 NR,TN,KE,BA,KE +2022 10 ZA,NR,NR,BB,KE +2022 11 NR,TN,BA,PO,ZA +2022 12 TN,KE,BA,PO,BB +2022 13 ZA,PO,KE,ZA,NR +2022 14 TT,BB,KE,BB,NR +2022 15 TT,PO,ZA,ZA,BA +2022 16 TN,KE,BB,ZA,TT +2022 17 BA,ZA,PO,TN,KE +2023 01 TN,PO,BA,TT,TT +2023 02 KE,ZA,BB,TN,PO +2023 03 NR,ZA,ZA,PO,BA +2023 04 ZA,KE,ZA,BA,NR +2023 05 BA,ZA,PO,KE,BB +2023 06 ZA,NR,PO,BB,KE +2023 07 NR,TN,BB,TT,ZA +2023 08 TN,PO,TN,KE,PO +2023 09 KE,TT,BA,PO,ZA +2023 10 ZA,PO,TN,NR,ZA +2023 11 TN,ZA,BA,BB,BB +2023 12 KE,BA,TT,KE,BB +2023 13 ZA,PO,KE,KE,NR +2023 14 TN,NR,PO,TN,BA +2023 15 PO,TN,TT,PO,ZA +2023 16 BA,BB,KE,BB,TT +2023 17 KE,PO,BB,ZA,KE +2024 01 PO,KE,TN,BB,PO +2024 02 ZA,KE,NR,KE,TN +2024 03 ZA,BA,NR,KE,PO +2024 04 NR,TT,KE,BA,NR +2024 05 NR,ZA,TT,BA,BB +2024 06 NR,BB,TN,ZA,TT +2024 07 ZA,BB,BB,TT,ZA +2024 08 ZA,TT,PO,TN,ZA +2024 09 TN,BA,TN,TT,ZA +2024 10 ZA,BB,ZA,PO,NR +2024 11 BA,NR,BB,ZA,ZA +2024 12 KE,TN,ZA,BB,BB +2024 13 ZA,KE,NR,TT,ZA +2024 14 NR,TN,PO,ZA,BB +2024 15 
BA,KE,ZA,PO,ZA +2024 16 PO,ZA,NR,BB,KE +2024 17 BB,PO,ZA,PO,TT + + + + + +BA - Bratislavsky +TN - Trencinsky +TT - Trnavsky +NR - Nitriansky +BB - Banskobystricky +ZA - Zilinsky +KE - Kosicky +PO - Presovsky \ No newline at end of file diff --git a/pdfs.txt b/pdfs.txt deleted file mode 100644 index 58e91e4..0000000 --- a/pdfs.txt +++ /dev/null @@ -1,136 +0,0 @@ -https://siov.sk/wp-content/uploads/2024/04/01-Problematika-volneho-casu.pdf -https://siov.sk/wp-content/uploads/2024/04/02-Matematika-fyzika.pdf -https://siov.sk/wp-content/uploads/2024/04/03-Chemia-potravinarstvo.pdf -https://siov.sk/wp-content/uploads/2024/04/04-Biologia.pdf -https://siov.sk/wp-content/uploads/2024/04/05-Zivotne-prostredie-geografia-geologia.pdf -https://siov.sk/wp-content/uploads/2024/04/06-Zdravotnictvo-farmakologia.pdf -https://siov.sk/wp-content/uploads/2024/04/07-Podohospodarstvo-polnohospodarstvo-lesne-a-vodne-hospodarstvo.pdf -https://siov.sk/wp-content/uploads/2024/04/08-Cestovny-ruch-hotelierstvo-gastronomia.pdf -https://siov.sk/wp-content/uploads/2024/04/09-Strojarstvo-hutnictvo-doprava.pdf -https://siov.sk/wp-content/uploads/2024/04/10-Stavebnictvo-geodezia-kartografia.pdf -https://siov.sk/wp-content/uploads/2024/04/11-Informatika.pdf -https://siov.sk/wp-content/uploads/2024/04/12-Elektrotechnika-hardware-mechatronika.pdf -https://siov.sk/wp-content/uploads/2024/04/13-Historia-filozofia-pravne-vedy.pdf -https://siov.sk/wp-content/uploads/2024/04/14-Tvorba-ucebnych-pomocok-didakticke-technologie.pdf -https://siov.sk/wp-content/uploads/2024/04/15-Ekonomika-a-riadenie.pdf -https://siov.sk/wp-content/uploads/2024/04/16-Teoria-kultury-umenie-umelecka-odevna-tvorba.pdf -https://siov.sk/wp-content/uploads/2024/04/17-Pedagogika-psychologia-sociologia-1.pdf -https://siov.sk/wp-content/uploads/2023/06/01VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/02VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/03VysledkovaListina2023.pdf 
-https://siov.sk/wp-content/uploads/2023/06/04VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/05VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/06VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/07VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/08VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/09VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/10VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/11VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/12VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/13VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/14VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/15VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/16VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2023/06/17VysledkovaListina2023.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-01.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-02.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-03.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-04.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-05.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-06.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-07.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-08.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-09.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-10-1.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-11.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-12.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-13.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-14.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-15.pdf -https://siov.sk/wp-content/uploads/2022/05/VL-odbor-16.pdf 
-https://siov.sk/wp-content/uploads/2022/05/VL-odbor-17.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-01.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-02.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-03.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-04.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-05.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-06.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-07.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-08.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-09.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-10.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-11.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-12.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-13.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-14.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-15.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-16.pdf -https://siov.sk/wp-content/uploads/2021/05/Vysledkova-listina-odbor-17.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-01.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-02.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-03.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-04.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-05.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-06.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-07.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-08.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-09.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-10.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-11.pdf 
-https://siov.sk/wp-content/uploads/2020/05/Odbor-12.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-13.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-14.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-15.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-16.pdf -https://siov.sk/wp-content/uploads/2020/05/Odbor-17.pdf -https://siov.sk/wp-content/uploads/2019/05/01-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/02-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/03-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/04-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/05-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/06-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/07-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/08-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/09-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/10-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/11-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/12-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/13-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/14-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/15-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/16-vysledkova-listina-SOC-2019.pdf -https://siov.sk/wp-content/uploads/2019/05/17-vysledkova-listina-SOC-2019.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledková-listina-odbor-c.1.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.2-final.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.3.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.04.pdf 
-http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.5.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.6.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.7.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.8.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.9.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.10.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.11.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.12.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.13.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.14.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.15.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.16.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-odbor-c.17.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-01.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-02.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-03.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-04.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-05.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-06.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-07.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-08.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-09.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-10.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-11.pdf 
-http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-12.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-13.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-14.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-15.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-16.pdf -http://siov.sk/wp-content/uploads/2019/02/Vysledkova-listina-CK-SOC-2017-odbor-17.pdf diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1190bd8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +beautifulsoup4