132 lines
4.8 KiB
Python
132 lines
4.8 KiB
Python
import argparse
|
|
|
|
import numpy as np
|
|
import scipy.stats as stats
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Command-line interface: -g/--graph turns plotting on, -s/--save names the
# file the figure is written to (empty string means "no file given").
parser = argparse.ArgumentParser()
parser.add_argument(
    "-g",
    "--graph",
    action="store_true",
    default=False,
    help="Plot graph",
)
parser.add_argument(
    "-s",
    "--save",
    default="",
    help="Graph save location",
)
args = parser.parse_args()
graph, save = args.graph, args.save

# Outline colours and fill colours used for the plotted violins.
edge_colors = ["blue", "green", "red"]
colors = ["lightblue", "lightgreen", "lightcoral"]

# The data is read from the current working directory.
dataset = np.load("clean.npy")

# Banner: three output lines (the last one empty), emitted with a single call.
print(
    f"dataset shape: {dataset.shape}; analyzing column 11 (absence)",
    "\tinteger value",
    "",
    sep="\n",
)
|
|
|
|
|
|
def analyze_absence(name: str, col: np.ndarray, absence: "np.ndarray | None" = None):
    """Correlate absence values with one grade column using Kendall's tau.

    Prints the coefficient, the p-value and a significance verdict at the
    0.05 level.

    Args:
        name: Label used in the printed report.
        col: Values to correlate with absence, one entry per row.
        absence: Absence values aligned with ``col``. When omitted, column 11
            of the module-level ``dataset`` is used.

    Returns:
        A tuple ``((absence_values, col), tau, p)``.
    """
    absence_col = dataset[:, 11] if absence is None else absence
    tau, p = stats.kendalltau(absence_col, col)
    print(f"ken-tau for {name}: {tau}")
    print(f"p-value for {name}: {p}")

    # The p-value can be NaN (e.g. when one of the inputs is constant). NaN
    # compares False with everything, so test for significance positively:
    # an undefined result must not be reported as "significant".
    if p <= 0.05:
        print("statistically significant\n")
    else:
        print("statistically insignificant\n")

    return (absence_col, col), tau, p
|
|
|
|
|
|
# Correlate absence with each grade column of the dataset
# (columns 2-5: gpa, math, slovak, english). Every call prints its own report.
data_gpa, tau_gpa, p_gpa = analyze_absence("gpa", dataset[:, 2])
data_math, tau_math, p_math = analyze_absence("math", dataset[:, 3])
data_slovak, tau_slovak, p_slovak = analyze_absence("slovak", dataset[:, 4])
data_english, tau_english, p_english = analyze_absence("english", dataset[:, 5])

# Parallel lists, one entry per grade, in the order the panels are plotted.
data = [data_gpa, data_math, data_slovak, data_english]
taus = [tau_gpa, tau_math, tau_slovak, tau_english]
ps = [p_gpa, p_math, p_slovak, p_english]

# Without --graph the printed report is the whole output, so stop here.
# SystemExit is raised directly instead of calling exit(): that helper is
# injected by the `site` module for interactive use and is not guaranteed to
# exist (e.g. under `python -S`).
if not graph:
    raise SystemExit(0)
|
|
|
|
# Panel titles and the matching axis labels, one entry per plotted grade.
grade_names = [
    "Priemer",
    "Matematika",
    "Slovenčina",
    "Angličtina",
]
grade_name_labels = [
    "Priemer známok",
    "Známka z matematiky",
    "Známka zo slovenčiny",
    "Známka z angličtiny",
]

# A 2x2 grid of axes on a 12 x 9 inch figure with a common title.
fig, axs = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(12, 9)
fig.suptitle("Absencia")
|
|
|
|
absence_label = "Počet vymeškaných hodín"
legend_text = "Na ľavo - priemer (červená)\nNa pravo - medián (zelená)\nSivá - medzi kvartilom 1 a 3"

# One panel per grade. axs.flat walks the 2x2 grid row by row, so panel
# `index` corresponds to data[index], taus[index] and ps[index].
for index, ax in enumerate(axs.flat):
    is_gpa_panel = index == 0

    if is_gpa_panel:
        # Panel 0: scatter of absence (column 11) against gpa (column 2)
        # with a degree-1 polynomial fit drawn as the trendline.
        absences = dataset[:, 11]
        gpa = dataset[:, 2]
        ax.scatter(absences, gpa)
        ax.set_xlabel(absence_label)
        ax.set_ylabel(grade_name_labels[index])

        trend = np.poly1d(np.polyfit(absences, gpa, 1))
        ax.plot(absences, trend(absences), color="gray")
    else:
        # data[index] is (absence values, grade values). Split the absence
        # values by grade 1..5 once and reuse the split for everything below.
        absences, grades = data[index]
        absences_by_grade = [absences[grades == grade] for grade in range(1, 6)]

        # A grade with no rows is plotted as the single value 0 --
        # presumably because violinplot cannot handle an empty dataset.
        parts = ax.violinplot(
            [group if len(group) else [0] for group in absences_by_grade],
            showmeans=True,
            showmedians=True,
        )
        ax.set_xticks(np.arange(1, 6), labels=["1", "2", "3", "4", "5"])
        ax.set_xlabel(grade_name_labels[index])
        ax.set_ylabel(absence_label)

        # Thick gray bar from the first to the third quartile of every
        # non-empty grade; the quartiles are also echoed to stdout.
        for slot, group in enumerate(absences_by_grade):
            if len(group) == 0:
                continue
            quartile1, median, quartile3 = np.percentile(group, [25, 50, 75])
            print(quartile1, median, quartile3)
            ax.vlines(slot + 1, quartile1, quartile3, color="gray", linewidths=3)

        parts["cmeans"].set_color("red")
        parts["cmedians"].set_color("green")

        # Cycle through the fill/outline palettes, one pair per violin body.
        for i, body in enumerate(parts["bodies"]):
            body.set_facecolor(colors[i % len(colors)])
            body.set_edgecolor(edge_colors[i % len(edge_colors)])

    ax.set_title(grade_names[index])

    # Correlation summary in the top-left corner (axes coordinates).
    tau = taus[index]
    p_value = ps[index]
    ax.text(
        0.01,
        0.99,
        f"Tau τ: {tau:.4f}\np-val: {p_value:.4f}",
        ha="left",
        va="top",
        transform=ax.transAxes,
        fontweight="bold",
    )

    if is_gpa_panel:
        continue

    # Violin panels only: legend in the top-right corner, then the numeric
    # mean (left of the violin, red) and median (right, green) per grade.
    ax.text(0.99, 0.99, legend_text, ha="right", va="top", transform=ax.transAxes)
    for slot, group in enumerate(absences_by_grade):
        if len(group) == 0:
            continue
        median = np.median(group)
        mean = group.mean()
        # Only non-negative values are labelled.
        if median >= 0:
            ax.text(slot + 1.14, median - 5, f"{median:.2f}", color="green", ha="left")
        if mean >= 0:
            ax.text(slot + 0.85, mean - 5, f"{mean:.2f}", color="red", ha="right")
|
|
|
|
# Tighten the layout, then show the figure interactively unless a save
# location was given on the command line.
fig.tight_layout()
if not save:
    plt.show()
else:
    plt.savefig(save)
|