# state-soc-cross/analysis.py

from typing import List
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# Two-letter county codes mapped to consecutive integer ids, numbered in the
# order the codes are listed here.
counties = dict(
    zip(
        ("BA", "TN", "TT", "NR", "BB", "ZA", "PO", "KE"),
        range(8),
    )
)
# County codes as a list, so that an id can be turned back into its code by
# indexing (dict insertion order matches id order).
counties_k = [*counties]


def map_counties(arr: List[str]) -> List[int]:
    """Translate county codes into their integer ids, preserving order.

    Each code is looked up in the module-level ``counties`` mapping; a code
    that is not a key of that mapping raises ``KeyError``.
    """
    return [counties[code] for code in arr]


# Each non-blank line of parsed.txt has three whitespace-separated fields:
#   <year> <category id> <comma-separated county codes>
# Every parsed line becomes one row: [year, category, *county ids].
raw_data = []
with open("parsed.txt") as stream:
    # Iterate the file lazily instead of loading every line up front.
    for line in stream:
        # Split on any whitespace before testing for emptiness: a raw line
        # still carries its trailing newline, so a blank line is truthy and
        # would otherwise reach int() and crash.
        split = line.split()
        if not split:
            continue

        year = int(split[0])
        category = int(split[1])
        wins_raw = split[2].split(",")

        raw_data.append([year, category, *map_counties(wins_raw)])


# Column layout of `data`:
# 0  - year
# 1  - category (Abteilung) id
# 2+ - county ids, in the order the codes appear on the input line
#      (first to last place)
data = np.array(raw_data)

print("Testing place distribution for normality by county")
# normaltest combines a skewness and a kurtosis test; the skewness test needs
# at least 8 observations to be valid.
MIN_SAMPLES = 8
for county_code, county_id in counties.items():
    # Gather the 0-based finishing position of every appearance of this
    # county. Columns 0 and 1 hold year and category; every later column is a
    # placing, so slice to the end of the row to include the last place.
    places = []
    for sample in data:
        places.extend(
            place for place, county in enumerate(sample[2:]) if county == county_id
        )

    # Skip counties with too little data instead of crashing or reporting a
    # meaningless (NaN) p-value as "normally distributed".
    if len(places) < MIN_SAMPLES:
        print(f"{county_code}: too few placings ({len(places)}) to test normality")
        continue

    # Null hypothesis: the sample comes from a normal distribution, so a
    # p-value below 0.05 rejects normality.
    _, p = stats.normaltest(places)
    # The qualifier is built outside the f-string: reusing double quotes
    # inside a double-quoted f-string is a syntax error before Python 3.12.
    verdict = "not " if p < 0.05 else ""
    print(f"{county_code}: {p:.4f} - {verdict}normally distributed")