Compare commits

...

10 Commits

Author         SHA1        Message                                      Date
Daniel Svitan  2f3c547b55  🐛 Fixes test loss value                     2025-01-06 20:12:40 +01:00
Daniel Svitan  f6eafc28ec  🔨 Changes absence grade plots to boxplots   2024-12-27 16:39:18 +01:00
Daniel Svitan  96a6599cf9  💄 Fixes minor mistakes                      2024-12-27 16:13:25 +01:00
Daniel Svitan  6ddd476834  Changes posthoc to conover                   2024-12-27 15:28:47 +01:00
Daniel Svitan  dc2e417969  💄 Adds colors to violins                    2024-12-27 15:10:37 +01:00
Daniel Svitan  ab0d117c70  Updates gitignore                            2024-12-27 13:05:42 +01:00
Daniel Svitan  f5fb3f647a  💄 Fixes mean and median on graph            2024-12-27 11:56:43 +01:00
Daniel Svitan  3ad7babcdc  Adds printing group differences              2024-12-27 11:48:39 +01:00
Daniel Svitan  6831e847ff  Adds automatic output saving                 2024-12-23 18:08:18 +01:00
Daniel Svitan  29ab473c3c  Automates analysis                           2024-12-23 16:25:21 +01:00
8 changed files with 209 additions and 80 deletions

.gitignore (vendored, 3 changed lines)

@@ -6,6 +6,7 @@ venv/
 __pycache__/
 results/
+paper/
 *.zip
 *.csv
@@ -13,6 +14,8 @@ results/
 *.jasp
 *.pth
+*.png
+*.drawio
 *.tar.gz
 *.zip

Makefile (new file, 2 added lines)

@@ -0,0 +1,2 @@
+make analyze:
+	./analyze.sh

(changed file, name not captured)

@@ -1,8 +1,11 @@
 from typing import List
+import itertools
 import argparse

 import numpy as np
+import pandas as pd
 import scipy.stats as stats
+import scikit_posthocs as sp
 import matplotlib.pyplot as plt

 parser = argparse.ArgumentParser()
@@ -12,34 +15,84 @@ args = parser.parse_args()
 graph = args.graph
 save = args.save

+colors = ["lightblue", "lightgreen", "lightcoral"]
+edge_colors = ["blue", "green", "red"]
+# source: mostly ChatGPT (ain't no way i'm writing this shit myself)

 def analyze(name: str, data: List[np.ndarray]):
-    #print(f"Checking if normally distributed for {name}")
-    #for i in range(len(data)):
-    #    _, normal_p = stats.shapiro(data[i])
-    #    if normal_p > 0.05:
-    #        print(f"\tGroup {i}: normally distributed")
-    #    else:
-    #        print(f"\tGroup {i}: NOT normally distributed")
     filtered_data = []
+    group_names = []
+    all_values = []

     for index, item in enumerate(data):
-        if len(item) > 5:
-            filtered_data.append(item)
+        numeric_data = [x for x in item if isinstance(x, (int, float))]
+        if len(numeric_data) > 5:
+            filtered_data.append(numeric_data)
+            group_names.append(chr(65 + index))
+            all_values.extend(numeric_data)
         else:
-            print(f"Data group at index {index} removed due to insufficient size ({len(item)})")
+            print(f"Data group at index {index} removed due to insufficient size ({len(numeric_data)})")

+    if len(filtered_data) < 2:
+        print(f"Insufficient number of groups for Kruskal-Wallis test in {name}")
+        return None, None

+    # Kruskal-Wallis Test
     F, p = stats.kruskal(*filtered_data)
-    print(f"F-stats for {name}: {F}")
-    print(f"p-value for {name}: {p}")
+    print(f"\nF-stats for {name}: {F:.8f}")
+    print(f"p-value for {name}: {p:.8f}")

-    if round(p, 4) > 0.05:
+    if p > 0.05:
         print("statistically insignificant\n")
         return F, p

     print("statistically significant")
-    tukey_results = stats.tukey_hsd(*filtered_data)
-    print(tukey_results)

+    # Post-Hoc Conover Test (Bonferroni-adjusted p-values)
+    all_ranks = stats.rankdata(all_values)  # Rank all values together
+    group_ranks = [all_ranks[start:start + len(group)] for start, group in
+                   zip(np.cumsum([0] + [len(g) for g in filtered_data[:-1]]), filtered_data)]
+    posthoc_results = sp.posthoc_conover(filtered_data, p_adjust='bonferroni')

+    results = []
+    total_sample_size = len(all_values)

+    for group1, group2 in itertools.combinations(group_names, 2):
+        idx1 = group_names.index(group1)
+        idx2 = group_names.index(group2)
+        mean_rank_1 = np.mean(group_ranks[idx1])
+        mean_rank_2 = np.mean(group_ranks[idx2])
+        rank_diff = mean_rank_1 - mean_rank_2
+        n1 = len(filtered_data[idx1])
+        n2 = len(filtered_data[idx2])

+        # Effect size (Rank-Biserial Correlation)
+        z_stat = rank_diff / np.sqrt((n1 + n2) * (n1 * n2) / total_sample_size)
+        effect_size = z_stat / np.sqrt(total_sample_size)

+        # Mean difference
+        mean_diff = np.mean(filtered_data[idx1]) - np.mean(filtered_data[idx2])
+        # Median difference
+        median_diff = np.median(filtered_data[idx1]) - np.median(filtered_data[idx2])

+        # Post-Hoc Conover p-value
+        p_value = posthoc_results.loc[idx1 + 1, idx2 + 1]

+        results.append({
+            "Skupina 1": group1,
+            "Skupina 2": group2,
+            "Veľkosť účinku": f"{effect_size:.4f}",
+            "Rozdiel priemerov": f"{mean_diff:.4f}",
+            "Rozdiel mediánov": f"{median_diff:.4f}",
+            "Post-Hoc p-hodnota": f"{p_value:.4f}"
+        })

+    results_df = pd.DataFrame(results, dtype="object")
+    print("\nSummary Table of Effect Size, Mean, and Median Differences:")
+    print(results_df.to_markdown(index=False, tablefmt="github", disable_numparse=True))
+    print("")

     return F, p
@@ -60,25 +113,52 @@ def plot_violin(data, labels, Fs, ps, title):
             index = j * 2 + k
             step = 1 if index > 0 else 0.5
-            axs[j, k].violinplot(data[index], showmedians=True)
+            parts = axs[j, k].violinplot(data[index], showmedians=True, showmeans=True)
             axs[j, k].set_title(grade_names[index])
             axs[j, k].set_xlabel(title, fontweight="bold")
             axs[j, k].set_ylabel(grade_name_labels[index], fontweight="bold")

+            # q1-q3 lines
+            for ind, vec in enumerate(data[index]):
+                quartile1, median, quartile3 = np.percentile(vec, [25, 50, 75])
+                if quartile1 == quartile3:
+                    if quartile1 >= 0.1:
+                        quartile1 -= 0.1
+                    if quartile3 <= max(vec) - 0.1:
+                        quartile3 += 0.1
+                axs[j, k].vlines(ind + 1, quartile1, quartile3, color="gray", linewidths=3)

             axs[j, k].set_xticks(np.arange(1, len(labels) + 1), labels=labels)
             axs[j, k].set_yticks(np.arange(1, 5.01, step))

-            F = round(Fs[index], 2)
-            p = round(ps[index], 4)
-            axs[j, k].text(0.01, 0.99, f"F-stat: {F:.2f}\np-val: {p:.4f}", ha="left", va="top", transform=axs[j, k].transAxes,
+            parts["cmeans"].set_color("red")
+            parts["cmedians"].set_color("green")
+            for i, part in enumerate(parts["bodies"]):
+                part.set_facecolor(colors[i % len(colors)])
+                part.set_edgecolor(edge_colors[i % len(edge_colors)])

+            F = Fs[index]
+            p = ps[index]
+            axs[j, k].text(0.01, 0.99, f"F-stat: {F:.4f}\np-val: {p:.4f}", ha="left", va="top",
+                           transform=axs[j, k].transAxes,
                            fontweight="bold")
+            axs[j, k].text(0.99, 0.99,
+                           f"Na ľavo - priemer (červená)\nNa pravo - medián (zelená)\nSivá - medzi kvartilom 1 a 3",
+                           ha="right",
+                           va="top",
+                           transform=axs[j, k].transAxes)

             medians = list([np.median(a) for a in data[index]])
-            for l in range(len(medians)):
-                median = round(medians[l], 2)
-                axs[j, k].text(l + 1.05, median + 0.05, f"{median}")
+            means = list([a.mean() for a in data[index]])
+            for l in range(len(data[index])):
+                median = medians[l]
+                mean = means[l]
+                # left - mean, right - median
+                axs[j, k].text(l + 1.13, median - 0.05, f"{median:.2f}", color="green")
+                axs[j, k].text(l + 0.90 - len(labels) * 0.065, mean - 0.05, f"{mean:.2f}", color="red")

     fig.tight_layout()
-    fig.show()
     if save != "":
         plt.savefig(save)
     else:
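
For orientation, the statistical flow that analyze() implements after these commits is: an omnibus Kruskal-Wallis test across the groups, and only when that test is significant, a pairwise Conover post-hoc test with Bonferroni correction. Below is a minimal, self-contained sketch of that flow on toy data; the group values and names are illustrative and not taken from the repository.

import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp

# Three toy groups with slightly different centres (illustrative data only).
rng = np.random.default_rng(0)
groups = [rng.normal(loc, 1.0, size=30) for loc in (0.0, 0.3, 1.0)]

# Omnibus test: Kruskal-Wallis compares the rank distributions of all groups at once.
H, p = stats.kruskal(*groups)
print(f"H = {H:.4f}, p = {p:.4f}")

if p <= 0.05:
    # Pairwise post-hoc: Conover test with Bonferroni-adjusted p-values.
    # For a list of arrays, posthoc_conover returns a DataFrame labelled 1..k,
    # which is why the repository code indexes it with idx + 1.
    pairwise = sp.posthoc_conover(groups, p_adjust="bonferroni")
    print(pairwise)

Note that the repository prints the Kruskal-Wallis statistic under the label "F-stats"; it is the H statistic of the rank test, not an ANOVA F value.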

analyze.sh (new executable file, 19 added lines)

@@ -0,0 +1,19 @@
+#!/usr/bin/bash
+find results ! -name 'train.txt' -type f -exec rm -f {} +
+./venv/bin/python3 distribution.py --graph --save | tee results/distribution.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_sex.py --graph --save "results/Figure_13.png" | tee results/sex.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_ses.py --graph --save "results/Figure_14.png" | tee results/ses.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_occupation.py --graph --save "results/Figure_15.png" | tee results/occupation.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_living.py --graph --save "results/Figure_16.png" | tee results/living.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_commute.py --graph --save "results/Figure_17.png" | tee results/commute.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_sleep.py --graph --save "results/Figure_18.png" | tee results/sleep.txt
+echo -e "\n\n\n\n"
+./venv/bin/python3 analyze_absence.py --graph --save "results/Figure_19.png" | tee results/absence.txt

(changed file, name not captured)

@@ -6,8 +6,13 @@ import matplotlib.pyplot as plt
 parser = argparse.ArgumentParser()
 parser.add_argument("-g", "--graph", action="store_true", default=False, help="Plot graph")
+parser.add_argument("-s", "--save", default="", help="Graph save location")
 args = parser.parse_args()
 graph = args.graph
+save = args.save

+colors = ["lightblue", "lightgreen", "lightcoral"]
+edge_colors = ["blue", "green", "red"]

 dataset = np.load("clean.npy")
 print(f"dataset shape: {dataset.shape}; analyzing column 11 (absence)")
@@ -52,31 +57,37 @@ for j in range(2):
         index = j * 2 + k
         step = 1 if index > 0 else 0.5
-        if index == 0:
-            axs[j, k].scatter(dataset[:, 11], dataset[:, 2])
+        if not index:
+            x = data[index][0]  # absence
+            y = data[index][1]  # grade
+            axs[j, k].scatter(x, y)
             axs[j, k].set_xlabel("Počet vymeškaných hodín")
             axs[j, k].set_ylabel(grade_name_labels[index])
+            axs[j, k].set_yticks(np.arange(1, 6))

+            # trendline
+            z = np.polyfit(x, y, 1)
+            p = np.poly1d(z)
+            axs[j, k].plot(x, p(x), color="gray")
         else:
-            current = list([data[index][0][data[index][1] == i + 1] for i in range(5)])  # i wanna kms
-            axs[j, k].violinplot(list(filter(lambda x: len(x), current)), showmeans=True)
-            axs[j, k].set_xticks(np.arange(1, 6, 1), labels=["1", "2", "3", "4", "5"])
-            axs[j, k].set_xlabel(grade_name_labels[index])
-            axs[j, k].set_ylabel("Počet vymeškaných hodín")
+            by_grade = list([data[index][0][data[index][1] == i + 1] for i in range(5)])
+            # data[index][0] - absences
+            # data[index][1] - grades
+            # data[index][0][specific grade] - absences for that specific grade
+            # loop grades 1 through 5 and plug them in ^^
+            axs[j, k].boxplot(by_grade, tick_labels=["1", "2", "3", "4", "5"])

         axs[j, k].set_title(grade_names[index])
-        tau = round(taus[index], 2)
-        p = round(ps[index], 4)
-        axs[j, k].text(0.01, 0.99, f"Tau τ: {tau:.2f}\np-val: {p:.4f}", ha="left", va="top", transform=axs[j, k].transAxes,
+        tau = taus[index]
+        p = ps[index]
+        axs[j, k].text(0.01, 0.99, f"Tau τ: {tau:.4f}\np-val: {p:.4f}", ha="left", va="top",
+                       transform=axs[j, k].transAxes,
                        fontweight="bold")

+        if index:
+            by_grade = [data[index][0][data[index][1] == i + 1] for i in range(5)]
+            means = list([a.mean() for a in filter(lambda b: len(b), by_grade)])
+            for l in range(len(means)):
+                mean = round(means[l], 2)
+                axs[j, k].text(l + 1.02, mean + 5, f"{mean}")

 fig.tight_layout()
-fig.show()
-plt.show()
+if save != "":
+    plt.savefig(save)
+else:
+    plt.show()

(changed file, name not captured)

@@ -7,8 +7,11 @@ parser = argparse.ArgumentParser(
     prog="distribution"
 )
 parser.add_argument("-g", "--graph", action="store_true", default=False, help="Display graphs")
+parser.add_argument("-s", "--save", action="store_true", default=False, help="Save graphs")
 args = parser.parse_args()
 graph = args.graph
+save = args.save
+graph_index = 1

 dataset = np.load("clean.npy")
 print(f"dataset shape: {dataset.shape}; analyzing distribution\n")
@@ -19,6 +22,10 @@ def percent(fraction: float) -> str:

 def plot_pie(data, labels, title, explode=None):
+    global graph_index
+    if not graph:
+        return
+
     i = 0
     while i < len(data):
         if data[i] == 0:
@@ -32,10 +39,18 @@ def plot_pie(data, labels, title, explode=None):
     plt.title(title)
     plt.tight_layout()
-    plt.show()
+    if save:
+        plt.savefig(f"results/Figure_{graph_index}.png")
+        graph_index += 1
+    else:
+        plt.show()


 def plot_hist(data, title, xlabel, ylabel):
+    global graph_index
+    if not graph:
+        return
+
     plt.figure(figsize=(8, 6))
     plt.hist(data, 25, edgecolor="black")
     plt.title(title)
@@ -43,7 +58,11 @@ def plot_hist(data, title, xlabel, ylabel):
     plt.ylabel(ylabel)
     plt.tight_layout()
-    plt.show()
+    if save:
+        plt.savefig(f"results/Figure_{graph_index}.png")
+        graph_index += 1
+    else:
+        plt.show()


 grade = dataset[:, 0]
@ -62,12 +81,11 @@ print(f"4st year: {percent(grade_dist[3])}")
print(f"5st year: {percent(grade_dist[4])}") print(f"5st year: {percent(grade_dist[4])}")
print("") print("")
if graph: plot_pie(
plot_pie( grade_dist,
grade_dist, ["Prvý ročník", "Druhý ročník", "Tretí ročník", "Štvrtý ročník", "Piaty ročník"],
["Prvý ročník", "Druhý ročník", "Tretí ročník", "Štvrtý ročník", "Piaty ročník"], "Distribúcia ročníkov",
"Distribúcia ročníkov", )
)
sex = dataset[:, 1] sex = dataset[:, 1]
sex_dist = [ sex_dist = [
@ -79,15 +97,13 @@ print(f"Female: {percent(sex_dist[0])}")
print(f"Male: {percent(sex_dist[1])}") print(f"Male: {percent(sex_dist[1])}")
print("") print("")
if graph: plot_pie(sex_dist, ["Ženy", "Muži"], "Distribúcia pohlavia")
plot_pie(sex_dist, ["Ženy", "Muži"], "Distribúcia pohlavia")
print("--- GPA ---") print("--- GPA ---")
print("n/a") print("n/a")
print("") print("")
if graph: plot_hist(dataset[:, 2], "Distribúcia piemernu známok", "Piemerná známka", "Počet študentov/tiek")
plot_hist(dataset[:, 2], "Distribúcia piemernu známok", "Piemerná známka", "Počet študentov/tiek")
math = dataset[:, 3] math = dataset[:, 3]
math_dist = [ math_dist = [
@ -105,8 +121,7 @@ print(f"4: {percent(math_dist[3])}")
print(f"5: {percent(math_dist[4])}") print(f"5: {percent(math_dist[4])}")
print("") print("")
if graph: plot_pie(math_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok z matematiky")
plot_pie(math_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok z matematiky")
slovak = dataset[:, 4] slovak = dataset[:, 4]
slovak_dist = [ slovak_dist = [
@ -124,8 +139,7 @@ print(f"4: {percent(slovak_dist[3])}")
print(f"5: {percent(slovak_dist[4])}") print(f"5: {percent(slovak_dist[4])}")
print("") print("")
if graph: plot_pie(slovak_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok zo slovenčiny", (0, 0, 0, 0.25, 0.5))
plot_pie(slovak_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok zo slovenčiny", (0, 0, 0, 0.25, 0.5))
english = dataset[:, 5] english = dataset[:, 5]
english_dist = [ english_dist = [
@ -143,8 +157,7 @@ print(f"4: {percent(english_dist[3])}")
print(f"5: {percent(english_dist[4])}") print(f"5: {percent(english_dist[4])}")
print("") print("")
if graph: plot_pie(english_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok z angličtiny")
plot_pie(english_dist, ["1", "2", "3", "4", "5"], "Distribúcia známok z angličtiny")
ses = dataset[:, 6] ses = dataset[:, 6]
ses_dist = [ ses_dist = [
@ -158,8 +171,7 @@ print(f"Middle: {percent(ses_dist[1])}")
print(f"Upper: {percent(ses_dist[2])}") print(f"Upper: {percent(ses_dist[2])}")
print("") print("")
if graph: plot_pie(ses_dist, ["Nižšia trieda", "Stredná trieda", "Vyššia trieda"], "Distribúcia socio-ekonomických tried")
plot_pie(ses_dist, ["Nižšia trieda", "Stredná trieda", "Vyššia trieda"], "Distribúcia socio-ekonomických tried")
occupation = dataset[:, 7] occupation = dataset[:, 7]
occupation_dist = [ occupation_dist = [
@ -179,10 +191,9 @@ print(f"other : {percent(occupation_dist[4])}")
print(f"none : {percent(occupation_dist[5])}") print(f"none : {percent(occupation_dist[5])}")
print("") print("")
if graph: plot_pie(occupation_dist,
plot_pie(occupation_dist, ["Práca 10 a viac hodín týždenne", "Práca menej ako 10 hodín týždenne", "Šport", "Hudba", "Niečo iné",
["Práca 10 a viac hodín týždenne", "Práca menej ako 10 hodín týždenne", "Šport", "Hudba", "Niečo iné", "Žiadne"], "Distribúcia práce a aktivít")
"Žiadne"], "Distribúcia práce a aktivít")
living = dataset[:, 8] living = dataset[:, 8]
living_dist = [ living_dist = [
@ -200,10 +211,9 @@ print(f"dorms : {percent(living_dist[3])}")
print(f"other : {percent(living_dist[4])}") print(f"other : {percent(living_dist[4])}")
print("") print("")
if graph: plot_pie(living_dist,
plot_pie(living_dist, ["S rodinou", "S rodinným príslušníkom/ou", "Sám/a alebo so spolubývajúcim/ou", "Intrák", "Iné"],
["S rodinou", "S rodinným príslušníkom/ou", "Sám/a alebo so spolubývajúcim/ou", "Intrák", "Iné"], "Distribúcia životných situácií")
"Distribúcia životných situácií")
commute = dataset[:, 9] commute = dataset[:, 9]
commute_dist = [ commute_dist = [
@ -221,10 +231,9 @@ print(f"<= 1h : {percent(commute_dist[3])}")
print(f"> 1h : {percent(commute_dist[4])}") print(f"> 1h : {percent(commute_dist[4])}")
print("") print("")
if graph: plot_pie(commute_dist,
plot_pie(commute_dist, ["Intrák", "Menej ako 15 minút", "Menej ako 30 minút", "Menej ako hodinu", "Viac ako hodinu"],
["Intrák", "Menej ako 15 minút", "Menej ako 30 minút", "Menej ako hodinu", "Viac ako hodinu"], "Distribúcia dochádzania")
"Distribúcia dochádzania")
sleep = dataset[:, 10] sleep = dataset[:, 10]
sleep_dist = [ sleep_dist = [
@ -238,12 +247,10 @@ print(f"medium sleepers: {percent(sleep_dist[1])}")
print(f"long sleepers : {percent(sleep_dist[2])}") print(f"long sleepers : {percent(sleep_dist[2])}")
print("") print("")
if graph: plot_pie(sleep_dist, ["6 hodín a menej", "7 až 8 hodín", "9 a viac hodín"], "Distribúcia spánku")
plot_pie(sleep_dist, ["6 hodín a menej", "7 až 8 hodín", "9 a viac hodín"], "Distribúcia spánku")
print("--- ABSENCE ---") print("--- ABSENCE ---")
print("n/a") print("n/a")
print("") print("")
if graph: plot_hist(dataset[:, 11], "Distribúcia absencií", "Počet neprítomných hodín", "Počet študentov/tiek")
plot_hist(dataset[:, 11], "Distribúcia absencií", "Počet neprítomných hodín", "Počet študentov/tiek")

(changed file, name not captured)

@@ -25,16 +25,23 @@ nvidia-nvjitlink-cu12==12.4.127
 nvidia-nvtx-cu12==12.4.127
 packaging==24.2
 pandas==2.2.3
+pandas-flavor==0.6.0
+patsy==1.0.1
 pillow==11.0.0
 pyparsing==3.2.0
 python-dateutil==2.9.0.post0
 pytz==2024.2
 scikit-learn==1.6.0
+scikit-posthocs==0.11.2
 scipy==1.14.1
+seaborn==0.13.2
 setuptools==75.6.0
 six==1.17.0
+statsmodels==0.14.4
 sympy==1.13.1
+tabulate==0.9.0
 threadpoolctl==3.5.0
 torch==2.5.1
 typing_extensions==4.12.2
 tzdata==2024.2
+xarray==2024.11.0

(changed file, name not captured)

@@ -119,7 +119,7 @@ for epoch in range(epochs):
         pred = model(X)
         loss = loss_fn(pred, y)
-        test_loss = loss.item() * X.size(0)
+        test_loss += loss.item() * X.size(0)

     test_loss /= len(test_dataset)
     test_losses.append(test_loss)
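
The one-character change above is the 🐛 commit: with plain assignment, test_loss only ever held the contribution of the last batch before being divided by the dataset size; accumulating with += and dividing once at the end yields the mean per-sample loss, even when the final batch is smaller. A self-contained sketch of the pattern with a toy model and dataset; every name below is illustrative and not taken from the repository's training script:

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Toy setup: 100 samples in batches of 32, so the last batch has only 4 samples.
X_all = torch.randn(100, 4)
y_all = torch.randn(100, 1)
test_dataset = TensorDataset(X_all, y_all)
test_loader = DataLoader(test_dataset, batch_size=32)

model = nn.Linear(4, 1)
loss_fn = nn.MSELoss()

model.eval()
test_loss = 0.0
with torch.no_grad():
    for X, y in test_loader:
        pred = model(X)
        loss = loss_fn(pred, y)
        # Accumulate the per-batch mean loss weighted by the batch size ...
        test_loss += loss.item() * X.size(0)
# ... then divide once by the dataset size to get the mean per-sample loss.
# With `=` instead of `+=`, only the final (smaller) batch would be counted.
test_loss /= len(test_dataset)
print(f"test loss: {test_loss:.6f}")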