🚧 Adds (non-working) boxplot

This commit is contained in:
Daniel Svitan 2024-12-21 18:42:14 +01:00
parent 2b0829c78e
commit 9b90fb2db3
3 changed files with 51 additions and 9 deletions

View File

@ -15,7 +15,7 @@ The cleaned dataset will have the following structure:
| 8 | Living | enum | [0-4] | | 8 | Living | enum | [0-4] |
| 9 | Commute | enum | [0-4] | | 9 | Commute | enum | [0-4] |
| 10 | Sleep | enum | [0-2] | | 10 | Sleep | enum | [0-2] |
| 11 | Absence | int | - | | 11 | Absence | int | [0-∞] |
It will be saved in a `.npy` file (numpy format) It will be saved in a `.npy` file (numpy format)

View File

@ -1,7 +1,14 @@
from typing import List from typing import List
import argparse
import numpy as np import numpy as np
import scipy.stats as stats import scipy.stats as stats
import matplotlib.pyplot as plt
# Command-line interface: a single -g/--graph switch that turns plotting on.
parser = argparse.ArgumentParser()
parser.add_argument("-g", "--graph", action="store_true", default=False, help="Plot graph")
# NOTE(review): this appears to run at module level, so argv would be parsed as
# soon as the module is imported, not only when it is run directly; confirm.
args = parser.parse_args()
# Module-level flag read by plot_box() to decide whether to draw anything.
graph = args.graph
def analyze(name: str, data: List[np.ndarray]): def analyze(name: str, data: List[np.ndarray]):
@ -11,8 +18,37 @@ def analyze(name: str, data: List[np.ndarray]):
if p > 0.05: if p > 0.05:
print("statistically insignificant\n") print("statistically insignificant\n")
return return F, p
print("statistically significant") print("statistically significant")
tukey_results = stats.tukey_hsd(*data) tukey_results = stats.tukey_hsd(*data)
print(tukey_results) print(tukey_results)
return F, p
def plot_box(data, labels, Fs, ps, title, titles):
    """Draw one box plot per variable on a two-column grid of subplots.

    Does nothing unless the module-level ``graph`` flag is truthy.

    Args:
        data: One entry per panel; each entry is a sequence of sample
            arrays, one array per box drawn in that panel.
        labels: Tick labels for the boxes, shared by every panel.
        Fs: Test statistic for each panel, printed in the panel's corner.
        ps: p-value for each panel, printed in the panel's corner.
        title: Heading for the whole figure.
        titles: Heading for each panel.
    """
    if not graph:
        return

    # Pair every dataset with its heading and statistics up front, so the
    # drawing loop needs no index arithmetic and any panel count works.
    panels = list(zip(data, titles, Fs, ps))
    if not panels:
        return

    n_cols = 2
    n_rows = (len(panels) + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False keeps `axs` two-dimensional even for a single row.
    fig, axs = plt.subplots(n_rows, n_cols, sharex=True, squeeze=False)
    fig.suptitle(title)
    # 12 x 9 inches for the usual 2 x 2 grid; height scales with row count.
    fig.set_size_inches(12, 4.5 * n_rows)

    # With an odd number of panels the last grid cell is simply left empty.
    for ax, (groups, panel_title, f_stat, p_val) in zip(axs.ravel(), panels):
        # NOTE: Matplotlib 3.9 renamed `labels` to `tick_labels`; `labels`
        # is kept here so that older Matplotlib releases keep working.
        ax.boxplot(groups, labels=labels)
        ax.set_title(panel_title)
        F = round(f_stat, 2)
        p = round(p_val, 4)
        # Axes-relative coordinates pin the annotation to the top-left corner
        # regardless of the data range.
        ax.text(0.01, 0.99, f"F-stat: {F}\np-val: {p}", ha="left", va="top",
                transform=ax.transAxes, fontweight="bold")

    fig.tight_layout()
    # plt.show() alone displays the figure; a preceding fig.show() is redundant.
    plt.show()

View File

@ -1,6 +1,6 @@
import numpy as np import numpy as np
from analyze import analyze from analyze import analyze, plot_box
dataset = np.load("clean.npy") dataset = np.load("clean.npy")
print(f"dataset shape: {dataset.shape}; analyzing column 1 (sex)") print(f"dataset shape: {dataset.shape}; analyzing column 1 (sex)")
@ -11,13 +11,19 @@ print("")
def analyze_sex(name: str, col: np.ndarray): def analyze_sex(name: str, col: np.ndarray):
sex_col = dataset[:, 1] sex_col = dataset[:, 1]
analyze(name, [ data = [
col[sex_col == 0], col[sex_col == 0],
col[sex_col == 1] col[sex_col == 1]
]) ]
F, p = analyze(name, data)
return data, F, p
analyze_sex("gpa", dataset[:, 2]) data_gpa, F_gpa, p_gpa = analyze_sex("gpa", dataset[:, 2])
analyze_sex("math", dataset[:, 3]) data_math, F_math, p_math = analyze_sex("math", dataset[:, 3])
analyze_sex("slovak", dataset[:, 4]) data_slovak, F_slovak, p_slovak = analyze_sex("slovak", dataset[:, 4])
analyze_sex("english", dataset[:, 5]) data_english, F_english, p_english = analyze_sex("english", dataset[:, 5])
plot_box([data_gpa, data_math, data_slovak, data_english], ["Female", "Male"],
[F_gpa, F_math, F_slovak, F_english], [p_gpa, p_math, p_slovak, p_english],
"Pohlavie", ["Priemer", "Matematika", "Slovenčina", "Angličtina"])