soc-2024/analyze_occupation.py
2024-12-15 11:55:40 +01:00

34 lines
1.1 KiB
Python

import numpy as np
import scipy.stats as stats
# Load the pre-cleaned data array. Column 6 holds the occupation code that
# the analysis below groups by.
dataset = np.load("clean.npy")

# Banner plus a legend for the occupation codes. The label previously said
# "(sex)", a leftover from another script; this file groups by occupation.
print(f"dataset shape: {dataset.shape}, analyzing column 6 (occupation)")
occupation_labels = (
    "work hours / week >= 10",
    "work hours / week < 10",
    "sport",
    "music",
    "other",
    "none",
)
for code, label in enumerate(occupation_labels):
    print(f"\t{code} - {label}")
print("")
def analyze(
    name: str,
    col: np.ndarray,
    *,
    occupation_col: "np.ndarray | None" = None,
    alpha: float = 0.05,
):
    """Run a one-way ANOVA of ``col`` across the six occupation groups.

    Prints the F statistic and p-value. If the result is significant at
    ``alpha``, also prints the Tukey HSD pairwise comparison of the groups.

    Args:
        name: Label used in the printed output.
        col: 1-D array of values to compare, aligned row-for-row with the
            occupation codes.
        occupation_col: Occupation code (0-5) for each row. Defaults to
            column 6 of the module-level ``dataset``.
        alpha: Significance threshold for the p-value. Defaults to 0.05.

    Returns:
        None. All results are written to stdout.
    """
    if occupation_col is None:
        # Default grouping: column 6 of the module-level dataset.
        occupation_col = dataset[:, 6]

    # Split once and reuse the same samples for both the ANOVA and Tukey HSD.
    groups = [col[occupation_col == code] for code in range(6)]

    F, p = stats.f_oneway(*groups)
    print(f"F-stats for {name}: {F}")
    print(f"p-value for {name}: {p}")

    if np.isnan(p):
        # NaN compares False against any threshold, so without this guard an
        # undefined test (e.g. an empty group) would be reported as
        # significant and passed on to the post-hoc test.
        print("result undefined (p-value is NaN), skipping post-hoc test\n")
        return
    if p > alpha:
        print("statistically insignificant\n")
        return

    print("statistically significant")
    tukey_results = stats.tukey_hsd(*groups)
    print(tukey_results)
# Run the occupation analysis for each score column of the dataset, in order.
score_columns = (
    ("gpa", 2),
    ("math", 3),
    ("slovak", 4),
    ("english", 5),
)
for label, column_index in score_columns:
    analyze(label, dataset[:, column_index])