⚡ Adds parsing of 2022 PDF results
This commit is contained in:
parent
96a6810fcd
commit
8e4cac0789
30
get_data.py
30
get_data.py
@ -44,8 +44,10 @@ if ok.lower() == "n":
|
|||||||
|
|
||||||
|
|
||||||
counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
|
counties = ["BA", "TT", "TN", "NR", "ZA", "BB", "PO", "KE"]
|
||||||
semifull_counties = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
|
counties2023 = ["Bratislavský kraj", "Trnavský kraj", "Trenčianský kraj", "Nitriansky kraj",
|
||||||
"Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]
|
"Žilinský kraj", "Banskobystrický kraj", "Prešovský kraj", "Košický kraj"]
|
||||||
|
counties2022 = ["Bratislavský", "Trnavský", "Trenčiansky", "Nitriansky",
|
||||||
|
"Žilinský", "Banskobystrický", "Prešovský", "Košický"]
|
||||||
|
|
||||||
|
|
||||||
def extract_2024(cat_str: str, lines: List[str]):
|
def extract_2024(cat_str: str, lines: List[str]):
|
||||||
@ -71,16 +73,26 @@ def extract_2024(cat_str: str, lines: List[str]):
|
|||||||
def extract_2023(lines: List[str]):
|
def extract_2023(lines: List[str]):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for i in range(len(lines)):
|
for line in lines:
|
||||||
line = lines[i]
|
if line not in counties2023:
|
||||||
if line[0:2] not in ["2.", "3.", "4.", "5.", "6."]:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if lines[i-1] not in semifull_counties:
|
results.append(counties[counties2023.index(line)])
|
||||||
print("County not recognized: ", lines[i-1])
|
if len(results) == 5:
|
||||||
|
return results
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def extract_2022(lines: List[str]):
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
word = line.strip().split(" ")[-1].strip()
|
||||||
|
if word not in counties2022:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
results.append(counties[semifull_counties.index(lines[i-1])])
|
results.append(counties[counties2022.index(word)])
|
||||||
if len(results) == 5:
|
if len(results) == 5:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@ -108,7 +120,7 @@ for pdf in pdfs:
|
|||||||
cat = int(max(matches[0]))
|
cat = int(max(matches[0]))
|
||||||
cat_str = f"{cat:>02}"
|
cat_str = f"{cat:>02}"
|
||||||
|
|
||||||
if year != 2023:
|
if year != 2022:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
id = f"{year}-{cat_str}"
|
id = f"{year}-{cat_str}"
|
||||||
@ -130,6 +142,8 @@ for pdf in pdfs:
|
|||||||
results = extract_2024(cat_str, lines)
|
results = extract_2024(cat_str, lines)
|
||||||
case 2023:
|
case 2023:
|
||||||
results = extract_2023(lines)
|
results = extract_2023(lines)
|
||||||
|
case 2022:
|
||||||
|
results = extract_2022(lines)
|
||||||
print("got results: ", results)
|
print("got results: ", results)
|
||||||
|
|
||||||
exit(0)
|
exit(0)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user