✨ Adds cleaning dataset

2024-12-14 20:19:27 +01:00 · 2024-12-14 20:19:27 +01:00 · 1f150cb52c
commit 1f150cb52c
3 changed files with 185 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+.idea/
+.zed/
+.vscode/
+venv/
+
+*.zip
+*.csv
+*.npy
--- a/clean.py
+++ b/clean.py
@ -0,0 +1,166 @@
+import argparse
+import numpy as np
+import pandas as pd
+import re
+
+# Command-line interface: the dirty CSV is mandatory, the output path optional.
+parser = argparse.ArgumentParser(
+    prog="clean"
+)
+parser.add_argument("-i", "--input", required=True, help="Input dirty csv file")
+# The result is written with np.save further down, so the output is a binary
+# .npy file (np.save appends the ".npy" suffix when the path lacks one).
+parser.add_argument("-o", "--output", default="clean", help="Output clean npy file")
+args = parser.parse_args()
+
+# HEADERS: ["timestamp", "grade", "sex", "average grade", "math grade", "slovak grade", "english grade", "ses", "occupation", "living situation", "commute length", "sleep", "absence"]
+df = pd.read_csv(args.input)
+# Raw cell values, indexed as arr[row][column] in the HEADERS order above.
+arr = df.to_numpy()
+# Accumulates one encoded row per processed response.
+clean = []
+
+
+# debugging purposes
+# print(list([arr[i][12] for i in range(1, 20)]))
+# exit(0)
+
+
+def parse_gpa(txt: str) -> float:
+    """Convert a grade-average cell to a float.
+
+    Accepts digits with an optional decimal part separated by either "." or
+    "," (for example "2", "1.5" or "1,5"); surrounding whitespace is ignored.
+    Anything else is reported on stdout and the user is prompted on stdin
+    until a parsable number is entered.
+
+    Args:
+        txt: Raw cell value. Non-string values (a float, or NaN for an empty
+            cell) are converted with str() first instead of crashing.
+
+    Returns:
+        The parsed number.
+    """
+    num_regex = r"\d+([,.]\d*)?"
+
+    txt = str(txt).strip()
+    if re.fullmatch(num_regex, txt) is not None:
+        # float() only understands ".", so normalise a decimal comma.
+        return float(txt.replace(",", "."))
+
+    print(f"ERROR: Couldn't parse gpa '{txt}'")
+    while True:
+        fixed = input("Please enter fixed value: ")
+        try:
+            # Accept the same decimal comma here as in the automatic path.
+            return float(fixed.strip().replace(",", "."))
+        except ValueError:
+            # Still not a number: ask again.
+            continue
+
+
+def parse_ses(txt: str) -> int:
+    """Encode an SES answer by the prefix it starts with.
+
+    Returns 0, 1 or 2 for answers beginning with "Nižšia trieda",
+    "Stredná trieda" or "Vyššia trieda" respectively. Any other answer is
+    reported on stdout and encoded as 3.
+    """
+    known_prefixes = ("Nižšia trieda", "Stredná trieda", "Vyššia trieda")
+    for code, prefix in enumerate(known_prefixes):
+        if txt.startswith(prefix):
+            return code
+
+    print("ERROR: Couldn't determine SES")
+    return 3
+
+
+def parse_occupation(txt: str) -> int:
+    """Encode an occupation answer as an integer code.
+
+    The six known answers map to 0-5 and must match exactly. Any other value
+    is reported on stdout and encoded as 6.
+    """
+    codes = {
+        "Pracujem 10 hodín a viac týždenne": 0,
+        "Pracujem menej ako 10 hodín týždenne": 1,
+        "Športujem na profesionálnej alebo polo-profesionálnej úrovni": 2,
+        "Robím muziku na profesionálnej alebo polo-profesionálnej úrovni": 3,
+        "Robím inú profesionálnu alebo polo-profesionálnu aktivitu": 4,
+        "Nie": 5,
+    }
+    code = codes.get(txt)
+    if code is not None:
+        return code
+
+    print("ERROR: Couldn't determine occupation")
+    return 6
+
+
+def parse_living(txt: str) -> int:
+    """Encode a living-situation answer as an integer code.
+
+    The five known answers map to 0-4 and must match exactly. Any other value
+    is reported on stdout and encoded as 5.
+    """
+    codes = {
+        "Bývam s rodičmi": 0,
+        "Bývam s iným rodinným príslušníkom/čkou": 1,
+        "Bývam sám alebo so spolubývajúcim/ou": 2,
+        "Bývam na intráku": 3,
+        "Mám to inak": 4,
+    }
+    code = codes.get(txt)
+    if code is not None:
+        return code
+
+    print("ERROR: Couldn't determine living")
+    return 5
+
+
+def parse_commute(txt: str) -> int:
+    """Encode a commute-length answer as an integer code.
+
+    The five known answers map to 0-4 and must match exactly. Any other value
+    is reported on stdout and encoded as 5.
+    """
+    codes = {
+        "Bývam na intráku": 0,
+        "Menej ako 15 minút": 1,
+        "Menej ako 30 minút": 2,
+        "Menej ako hodinu": 3,
+        "Viac ako hodinu": 4,
+    }
+    code = codes.get(txt)
+    if code is not None:
+        return code
+
+    print("ERROR: Couldn't determine commute")
+    return 5
+
+
+def parse_sleep(txt: str) -> int:
+    """Encode a sleep-duration answer as an integer code.
+
+    The three known answers map to 0-2 and must match exactly. Any other
+    value is reported on stdout and encoded as 3.
+    """
+    codes = {
+        "9 hodín a viac": 0,
+        "7 až 9 hodín": 1,
+        "6 hodín a menej": 2,
+    }
+    code = codes.get(txt)
+    if code is not None:
+        return code
+
+    # Message spelling now matches the other parse_* helpers ("Couldn't").
+    print("ERROR: Couldn't determine sleep")
+    return 3
+
+
+def parse_absence(txt: str) -> float:
+    """Convert an absence cell to a float, prompting until it parses.
+
+    The value is passed to float(). On ValueError the offending text is
+    reported on stdout and a replacement is read from stdin; this repeats
+    until a replacement converts successfully.
+    """
+    candidate = txt
+    while True:
+        try:
+            value = float(candidate)
+        except ValueError:
+            print(f"ERROR: Couldn't parse absence '{candidate}'")
+            candidate = input("Please enter fixed value: ")
+        else:
+            return value
+
+
+# Encode every response as one output row.
+# pd.read_csv uses the first CSV line as column names by default, so arr[0] is
+# already a response; iterating from index 1 would silently drop it.
+for row in arr:
+    # Column positions follow the HEADERS list; row[0] (timestamp) is not used.
+    grade = row[1]
+    sex = row[2]
+    gpa = row[3]
+    math = row[4]
+    slovak = row[5]
+    english = row[6]
+    occupation = row[8]
+    living = row[9]
+    commute = row[10]
+    absence = row[12]
+
+    # NOTE(review): ses (row[7]) and sleep (row[11]) are not written to the
+    # output even though parse_ses / parse_sleep exist. Confirm whether that
+    # is intentional before adding columns.
+    current = [
+        grade,
+        0 if sex == "Žena" else 1,  # "Žena" -> 0, any other value -> 1
+        parse_gpa(gpa),
+        math,
+        slovak,
+        english,
+        parse_occupation(occupation),
+        parse_living(living),
+        parse_commute(commute),
+        parse_absence(absence),
+    ]
+
+    clean.append(np.array(current))
+
+# Report the number of rows actually written, not the size of the input.
+print(f"Saving {len(clean)} rows")
+np.save(args.output, np.array(clean))
--- a/print.py
+++ b/print.py
@ -0,0 +1,11 @@
+import argparse
+import numpy as np
+
+# Command-line interface: one optional path to the .npy file to display.
+parser = argparse.ArgumentParser(prog="print")
+parser.add_argument(
+    "-i",
+    "--input",
+    default="clean.npy",
+    help="Input npy file",
+)
+args = parser.parse_args()
+
+# Load with pickling disabled, then dump the array to stdout.
+arr = np.load(args.input, allow_pickle=False)
+print(arr)