diff --git a/analyze.py b/analyze.py new file mode 100644 index 0000000..20e5888 --- /dev/null +++ b/analyze.py @@ -0,0 +1,18 @@ +from typing import List + +import numpy as np +import scipy.stats as stats + + +def analyze(name: str, data: List[np.ndarray]): + F, p = stats.f_oneway(*data) + print(f"F-stats for {name}: {F}") + print(f"p-value for {name}: {p}") + + if p > 0.05: + print("statistically insignificant\n") + return + + print("statistically significant") + tukey_results = stats.tukey_hsd(*data) + print(tukey_results) diff --git a/analyze_occupation.py b/analyze_occupation.py index 5aa446c..f322934 100644 --- a/analyze_occupation.py +++ b/analyze_occupation.py @@ -1,5 +1,6 @@ import numpy as np -import scipy.stats as stats + +from analyze import analyze dataset = np.load("clean.npy") print(f"dataset shape: {dataset.shape}, analyzing column 6 (sex)") @@ -12,22 +13,19 @@ print("\t5 - none") print("") -def analyze(name: str, col: np.ndarray): +def analyze_occupation(name: str, col: np.ndarray): occupation_col = dataset[:, 6] - F, p = stats.f_oneway(col[occupation_col == 0], col[occupation_col == 1], col[occupation_col == 2], col[occupation_col == 3], col[occupation_col == 4], col[occupation_col == 5]) - print(f"F-stats for {name}: {F}") - print(f"p-value for {name}: {p}") - - if p > 0.05: - print("statistically insignificant\n") - return - - print("statistically significant") - tukey_results = stats.tukey_hsd(col[occupation_col == 0], col[occupation_col == 1], col[occupation_col == 2], col[occupation_col == 3], col[occupation_col == 4], col[occupation_col == 5]) - print(tukey_results) + analyze(name, [ + col[occupation_col == 0], + col[occupation_col == 1], + col[occupation_col == 2], + col[occupation_col == 3], + col[occupation_col == 4], + col[occupation_col == 5] + ]) -analyze("gpa", dataset[:, 2]) -analyze("math", dataset[:, 3]) -analyze("slovak", dataset[:, 4]) -analyze("english", dataset[:, 5]) +analyze_occupation("gpa", dataset[:, 2]) +analyze_occupation("math", dataset[:, 3]) +analyze_occupation("slovak", dataset[:, 4]) +analyze_occupation("english", dataset[:, 5]) diff --git a/analyze_sex.py b/analyze_sex.py index 50c763b..c47e489 100644 --- a/analyze_sex.py +++ b/analyze_sex.py @@ -1,5 +1,6 @@ import numpy as np -import scipy.stats as stats + +from analyze import analyze dataset = np.load("clean.npy") print(f"dataset shape: {dataset.shape}, analyzing column 1 (sex)") @@ -8,19 +9,12 @@ print("\t1 - male") print("") -def analyze(name: str, col: np.ndarray): +def analyze_sex(name: str, col: np.ndarray): sex_col = dataset[:, 1] - F, p = stats.f_oneway(col[sex_col == 0], col[sex_col == 1]) - print(f"F-stats for {name}: {F}") - print(f"p-value for {name}: {p}") - - if p > 0.05: - print("statistically insignificant\n") - return - - print("statistically significant") - tukey_results = stats.tukey_hsd(col[sex_col == 0], col[sex_col == 1]) - print(tukey_results) + analyze(name, [ + col[sex_col == 0], + col[sex_col == 1] + ]) analyze("gpa", dataset[:, 2])