-
Notifications
You must be signed in to change notification settings - Fork 0
/
utilities.py
99 lines (81 loc) · 3.73 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from typing import List
import pandas as pd
import os
import argparse
def dummy_encode(df: pd.DataFrame, cat_cols: List[str]) -> pd.DataFrame:
    '''
    Dummy-encode (one-hot) the columns of ``df`` named in ``cat_cols``
    using ``pandas.get_dummies`` and return the encoded dataframe.

    :param df: Dataframe to work with
    :param cat_cols: list of column names that need to be encoded. An empty
        list means "nothing to encode". ``None`` is forwarded to pandas
        unchanged.
    :return: ``df`` itself when ``cat_cols`` is empty, otherwise the new
        dataframe produced by ``pandas.get_dummies``
    '''
    # Explicit emptiness check: comparing the list against the integer 0
    # never matches, so the shortcut below would otherwise be unreachable.
    if cat_cols is not None and len(cat_cols) == 0:
        return df

    new_df = pd.get_dummies(df, columns=cat_cols)

    # Encoding should only add/replace columns; warn if the row count moved.
    if new_df.shape[0] != df.shape[0]:
        print("\n There might to have been data loss during encoding. \n Row counts do not match \n")
    return new_df
def find_cat(df: pd.DataFrame) -> List[str]:
    '''
    Iterates through the columns to find the ones that contain strings.
    ** Only looks at the first value of each column, hence assumes the
    whole column is of the same type.

    :param df: pandas dataframe to search in
    :return: list with the names of the columns whose first value is a
        string, in column order. Columns without any rows are skipped.
    '''
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    # The length check keeps zero-row frames from raising on the first-value
    # lookup.
    return [
        col_name
        for col_name, col_data in df.items()
        if len(col_data) > 0 and isinstance(col_data.iloc[0], str)
    ]
def adjust_enc_errors(df: pd.DataFrame, df2: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    '''
    When encoding with get_dummies, two dataframes might end up with
    different columns, since the domain of each categorical column might not
    be exactly the same in both.

    The solution is to add every column of ``df`` that is missing from
    ``df2`` (filled with 0) and to put the columns in the same order as in
    ``df``. The alignment is one-directional: columns that exist only in
    ``df2`` are not part of the returned frame.

    :param df: reference dataframe; returned as-is
    :param df2: dataframe to align to ``df``; it is not modified
    :return: tuple ``(df, aligned)`` where ``aligned`` has exactly the
        columns of ``df``, in the same order
    '''
    # Work on a copy so the caller's df2 does not silently gain columns.
    aligned = df2.copy()

    # Add each missing column with a default value equal to 0.
    for col in df.columns:
        if col not in aligned.columns:
            aligned[col] = 0

    # Ensure the column order matches the reference dataframe.
    return df, aligned[df.columns]
def check_dir_path(path_to_check: str) -> str:
    '''
    Checks if the provided path is an existing directory.
    If it is, it appends ``_<int>`` (starting at 2) to the directory name and
    checks again until no directory with such a name exists.

    A trailing path separator on the input is kept on the result, so both
    ``"runs/exp/"`` and ``"runs/exp"`` are handled.

    :param path_to_check: str The path to the location to check
    :return: str New path with which os.mkdir can be called
    '''
    if not os.path.isdir(path_to_check):
        return path_to_check

    print("Experiment with name: \'{}\' already exists. Appending int to folder name. \n ".format(path_to_check))

    # Split off any trailing separator(s) instead of blindly dropping the
    # last character, which would truncate names given without a slash.
    separators = os.sep + (os.altsep or '')
    stem = path_to_check.rstrip(separators)
    trailing = path_to_check[len(stem):]

    expand = 2
    new_path = stem + '_' + str(expand) + trailing
    while os.path.isdir(new_path):
        expand += 1
        new_path = stem + '_' + str(expand) + trailing

    print(f"Experiment name: {new_path} \n \n ")
    return new_path
def _str2bool(value):
    '''
    Convert a command-line token to a bool, for use as an argparse ``type``.

    :param value: the raw token (or an actual bool, which is returned as-is)
    :return: True for yes/true/t/y/1, False for no/false/f/n/0
        (case-insensitive)
    :raises argparse.ArgumentTypeError: for any other token
    '''
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


def parse_arguments(parser):
    '''
    Registers the ``--batch_size`` and ``--generate_data`` options on the
    given parser and parses the command line with it.

    :param parser: an ``argparse.ArgumentParser`` to extend
    :return: the namespace returned by ``parser.parse_args()``
    '''
    parser.add_argument('--batch_size', type=int, default=128, help='The batch size. (default value: 128)')
    # A plain ``type=bool`` would treat any non-empty string (even "False")
    # as True, so the flag goes through an explicit string-to-bool converter.
    parser.add_argument('--generate_data', type=_str2bool, default=False, help='If True the model generates data, if False the model is trained (default value: False)')
    args = parser.parse_args()
    return args