From 9b90fb2db31d6f25ecb074e9fdd7cd78d2b472de Mon Sep 17 00:00:00 2001
From: Daniel Svitan
Date: Sat, 21 Dec 2024 18:42:14 +0100
Subject: [PATCH] :construction: Adds (non-working) boxplot

---
 DATASET.md     |  2 +-
 analyze.py     | 38 +++++++++++++++++++++++++++++++++++++-
 analyze_sex.py | 20 +++++++++++++-------
 3 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/DATASET.md b/DATASET.md
index e22404f..2b63be1 100644
--- a/DATASET.md
+++ b/DATASET.md
@@ -15,7 +15,7 @@ The cleaned dataset will have the following structure:
 | 8 | Living | enum | [0-4] |
 | 9 | Commute | enum | [0-4] |
 | 10 | Sleep | enum | [0-2] |
-| 11 | Absence | int | - |
+| 11 | Absence | int | [0-∞] |
 
 It will be saved in a `.npy` file (numpy format)
 
diff --git a/analyze.py b/analyze.py
index 20e5888..a285936 100644
--- a/analyze.py
+++ b/analyze.py
@@ -1,7 +1,14 @@
 from typing import List
 
+import argparse
 import numpy as np
 import scipy.stats as stats
+import matplotlib.pyplot as plt
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-g", "--graph", action="store_true", default=False, help="Plot graph")
+args = parser.parse_args()
+graph = args.graph
 
 
 def analyze(name: str, data: List[np.ndarray]):
@@ -11,8 +18,37 @@ def analyze(name: str, data: List[np.ndarray]):
 
     if p > 0.05:
         print("statistically insignificant\n")
-        return
+        return F, p
 
     print("statistically significant")
     tukey_results = stats.tukey_hsd(*data)
     print(tukey_results)
+
+    return F, p
+
+
+def plot_box(data, labels, Fs, ps, title, titles):
+    if not graph:
+        return
+
+    fig, axs = plt.subplots(2, 2, sharex=True)
+    fig.suptitle(title)
+    fig.set_size_inches(12, 9)
+
+    for i in range(2):
+        for j in range(2):
+            print(f"{i}x{j} giving {i * 2 + j}")
+            axs[i, j].boxplot(data[i * 2 + j], labels=labels)
+            axs[i, j].set_title(titles[i * 2 + j])
+
+            F = round(Fs[i * 2 + j], 2)
+            p = round(ps[i * 2 + j], 4)
+            axs[i, j].text(0.01, 0.99, f"F-stat: {F}\np-val: {p}", ha="left", va="top", transform=axs[i, j].transAxes,
+                           fontweight="bold")
+
+            avgs = np.array([a.mean() for a in data[i * 2 + j]])
+            print(avgs)
+
+    fig.tight_layout()
+    fig.show()
+    plt.show()
diff --git a/analyze_sex.py b/analyze_sex.py
index ef761f1..e2be9a3 100644
--- a/analyze_sex.py
+++ b/analyze_sex.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from analyze import analyze
+from analyze import analyze, plot_box
 
 dataset = np.load("clean.npy")
 print(f"dataset shape: {dataset.shape}; analyzing column 1 (sex)")
@@ -11,13 +11,19 @@ print("")
 
 def analyze_sex(name: str, col: np.ndarray):
     sex_col = dataset[:, 1]
-    analyze(name, [
+    data = [
         col[sex_col == 0],
         col[sex_col == 1]
-    ])
+    ]
+    F, p = analyze(name, data)
+    return data, F, p
 
 
-analyze_sex("gpa", dataset[:, 2])
-analyze_sex("math", dataset[:, 3])
-analyze_sex("slovak", dataset[:, 4])
-analyze_sex("english", dataset[:, 5])
+data_gpa, F_gpa, p_gpa = analyze_sex("gpa", dataset[:, 2])
+data_math, F_math, p_math = analyze_sex("math", dataset[:, 3])
+data_slovak, F_slovak, p_slovak = analyze_sex("slovak", dataset[:, 4])
+data_english, F_english, p_english = analyze_sex("english", dataset[:, 5])
+
+plot_box([data_gpa, data_math, data_slovak, data_english], ["Female", "Male"],
+         [F_gpa, F_math, F_slovak, F_english], [p_gpa, p_math, p_slovak, p_english],
+         "Pohlavie", ["Priemer", "Matematika", "Slovenčina", "Angličtina"])