-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
82 lines (59 loc) · 2.61 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
import pandas as pd
import scipy.stats as stats
from settings.config import JsonFile
# Category lists read from the JSON settings file, keyed by variable
# (presumably variable name -> ordered list of categories; confirm against
# settings/config_values.json).
category_order = dict(JsonFile(path="settings/config_values.json").file)

# Put the "Var" label in front of every list. dict() only copies the top
# level, so these inserts touch the same list objects that JsonFile holds.
for categories in category_order.values():
    categories.insert(0, "Var")
def remove_duplicates(x: list) -> list:
    """Return the distinct values of ``x`` as a plain Python list.

    The values are passed through ``numpy.unique``, so the result comes back
    sorted rather than in the order of first appearance.
    """
    distinct = np.unique(x)
    return distinct.tolist()
def interpret_instruction(instruction: str) -> tuple:
    """Split a comma-separated instruction into a tuple of its fields.

    When the second field is exactly the ``${multiply}`` placeholder, a
    trailing ``"*"`` element is appended to the result.

    Args:
        instruction: Comma-separated instruction string.

    Returns:
        The fields of ``instruction`` in order, followed by ``"*"`` when the
        second field is ``${multiply}``. An instruction without any comma
        yields a one-element tuple.
    """
    fields = instruction.split(",")
    # An instruction without a comma has a single field, so the second field
    # can only be inspected when it exists.
    if len(fields) > 1 and fields[1] == "${multiply}":
        fields.append("*")
    return tuple(fields)
def calculate_percentages(structure: dict | None, df: pd.DataFrame, table_type: int) -> pd.DataFrame:
if table_type == 1:
columns_to_convert = [x for x in structure if x != "Var"]
df_selected = df[columns_to_convert]
divider = df["Total"].sum()
df[columns_to_convert] = df_selected.div(divider, axis=0)
df["Total"] = df["Total"] / divider
elif table_type == 2:
columns_to_convert = [x for x in structure if x != "Var"]
df_selected = df[columns_to_convert]
divider = df_selected.sum()
df[columns_to_convert] = df_selected.div(divider, axis=1)
df["Total"] = df["Total"] / df["Total"].sum(axis=0)
elif table_type == 3:
columns_to_convert = [x for x in structure if x not in ("Var", "Total")]
df_selected = df[columns_to_convert]
divider = df_selected.sum(axis=1)
df[columns_to_convert] = df_selected.div(divider, axis=0)
df["Total"] = df["Total"].div(df["Total"], axis=0)
else:
df.drop([x for x in df.columns.tolist() if x not in ("Var", "Total")], axis=1, inplace=True)
df["%"] = df["Total"] / df["Total"].sum(axis=0)
return df
def chi2_test(df: pd.DataFrame, alpha=0.05) -> tuple:
    """Run a chi-square test of independence on a contingency table.

    The ``"Total"`` and ``"Var"`` columns and the last row are excluded from
    the observed counts (presumably the last row is the totals row added by
    ``insert_total_row`` - confirm with the caller). The input frame itself
    is not modified.

    Args:
        df: Contingency table containing ``"Var"`` and ``"Total"`` columns.
        alpha: Significance level that the p-value is compared against.

    Returns:
        A 3-tuple ``(significant, message, p_value_text)``. When
        ``scipy.stats.chi2_contingency`` raises ``ValueError`` the result is
        ``(False, <explanation>, "Null")``.
    """
    # Build the observed matrix from a derived frame so the caller's table
    # keeps its columns and rows. The last row is removed by position, which
    # stays correct even if index labels repeat.
    observed = np.array(df.drop(columns=["Total", "Var"]).iloc[:-1])
    try:
        _, p_val, _, _ = stats.chi2_contingency(observed)
    except ValueError:
        return (False, "The internally computed table of expected frequencies has a zero element.",
                "Null")

    # Report the level actually used; 0.05 renders as ".05", matching the
    # previous fixed wording.
    level = f"{alpha:g}".removeprefix("0") or "0"
    p_text = f"p-value: {np.round(p_val, 3)}"
    # NOTE(review): the "Chi-squeare" spelling is kept verbatim because
    # callers or report templates may match on this text.
    if p_val < alpha:
        return (True,
                f"* The Chi-squeare statistic is significant at the {level} level.",
                p_text)
    return (False,
            f"* The Chi-squeare statistic is not significant at the {level} level.",
            p_text)
def insert_total_row(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a summary row appended at the bottom.

    The new row holds the column sums of every column except ``"Var"``, and
    its ``"Var"`` cell is set to ``"Total"``. The result is re-indexed from
    zero; ``df`` itself is not changed.
    """
    column_sums = df.drop(columns="Var").sum()
    # A Series becomes a one-column frame; transposing turns it into one row.
    summary = column_sums.to_frame().T.assign(Var="Total")
    return pd.concat([df, summary], ignore_index=True)