-
Notifications
You must be signed in to change notification settings - Fork 0
/
use_fun.py
363 lines (238 loc) · 10.5 KB
/
use_fun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import numpy as np
import os
import re
from xlwings import view
import xlwings as xw
import warnings
# Module-wide interactive settings: silence every warning and let pandas
# print DataFrames without truncating rows, columns, line width or cell text.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# ``None`` is the supported way to request "no limit"; the former ``-1``
# was deprecated in pandas 1.0 and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
# %config Completer.use_jedi = False
from datetime import datetime
def now_():
    """Return the current local time as a ``HH.MM.SS`` string."""
    current = datetime.now()
    return f"{current:%H.%M.%S}"


# Left-over call; its return value is discarded.
now_()
def my_ls(str_file_ending=None):
    """List the entries of the current working directory.

    Entries whose name starts with ``~`` or ends with ``.ipynb`` /
    ``.ipynb_checkpoints`` are left out.

    Args:
        str_file_ending: Optional suffix; when truthy, only names ending
            with it are returned.

    Returns:
        A list of entry names in the order ``os.listdir()`` yields them.
    """
    import os

    skipped_suffixes = ('.ipynb', '.ipynb_checkpoints')
    names = []
    for entry in os.listdir():
        if entry.startswith('~') or entry.endswith(skipped_suffixes):
            continue
        names.append(entry)

    if not str_file_ending:
        return names
    return [entry for entry in names if entry.endswith(str_file_ending)]
def conv_seconds_2_time_stamp(str_in):
    """Format a number of seconds as ``hours:minutes:seconds``.

    Despite its name, ``str_in`` is used as a number. Hours and minutes are
    printed as integers without zero padding; the seconds part is rounded to
    one decimal place.

    Args:
        str_in: Duration in seconds (int or float).

    Returns:
        A string such as ``'12:0:0'``.
    """
    whole_hours = str_in // (60 * 60)
    remainder = str_in - whole_hours * 60 * 60
    whole_minutes = remainder // 60
    leftover_seconds = round(remainder - whole_minutes * 60, 1)
    return "{}:{}:{}".format(int(whole_hours), int(whole_minutes), leftover_seconds)
# conv_seconds_2_time_stamp(43200)
def print_lst(lst_, sep_ = " \t| "):
    """Print the items of ``lst_`` on one line, joined by ``sep_``.

    Args:
        lst_: Iterable of values; each one is converted with ``str``.
        sep_: Separator placed between the items.
    """
    # The original iterated over an undefined name (``lst_p``) and therefore
    # always raised NameError; iterate over the actual argument instead.
    print(sep_.join(str(item) for item in lst_))
def get_fig_frequency(DF, lst_col, title = None, fig_size_x = 18.5, fig_size_y = 10.5):
    """Draw a box plot and a histogram for each requested column.

    The figure has one row per column of ``lst_col``: the left axes holds
    the box plot, the right axes the histogram.

    Args:
        DF: DataFrame providing the data.
        lst_col: Non-empty list of column names present in ``DF``.
        title: Optional figure title.
        fig_size_x: Figure width in inches.
        fig_size_y: Figure height in inches.

    Returns:
        The created matplotlib figure.

    Raises:
        AssertionError: If ``lst_col`` is empty or names an unknown column.
    """
    assert len(lst_col)>0, "List of columns is empty."
    assert [i for i in lst_col if i not in DF.columns] == [], "The list of column was not in the list of DF columns!"

    plt.style.use('ggplot')
    # squeeze=False always yields a 2-D axes array, so a single column can be
    # indexed as axs[0, 0] / axs[0, 1] and no longer has to be plotted twice.
    fig, axs = plt.subplots(len(lst_col), 2, squeeze=False)
    if title:
        fig.suptitle(title)

    for i, col in enumerate(lst_col):
        print(i, col)
        axs[i, 0].boxplot(DF[col])
        axs[i, 0].set_title(col)
        axs[i, 1].hist(DF[col])
        axs[i, 1].set_title(col)

    fig.set_size_inches(fig_size_x, fig_size_y)
    return fig
def to_excel_from_dct(dct_df, file_name, add_time_tag = False, close_workbook = False, dct_fig = None):
    """Write DataFrames (and optionally figures) to a new workbook via xlwings.

    Every key of ``dct_df`` / ``dct_fig`` becomes a sheet name. Sheets of the
    new workbook that were not created here are deleted before saving.

    Args:
        dct_df: Mapping of sheet name -> DataFrame, written at ``A1`` without
            the index and registered as a table named ``tbl_<sheet name>``.
        file_name: Name passed to ``wb.save``.
        add_time_tag: When true, any ``.xlsx`` in ``file_name`` is removed and
            `` -- <YYYY-mm-dd HH.MM.SS>.xlsx`` is appended.
        close_workbook: When true, the workbook is closed after saving.
        dct_fig: Optional mapping of sheet name -> figure, added as a picture.

    Returns:
        The file name that was passed to ``wb.save`` (including the time tag).

    Raises:
        AssertionError: If a sheet name is longer than 31 characters.
    """
    import xlwings as xw
    import os
    wb = xw.Book()
    # Passed as first argument to ListObjects.Add below.
    # presumably Excel's XlListObjectSourceType.xlSrcRange -- TODO confirm
    xlSrcRange = 1
    # NOTE(review): xlYes is assigned but never used; the Add call below
    # passes the literal 1 instead.
    xlYes = 1
    # Names of all sheets created by this function; anything else is deleted.
    lst_sheets = []
    str_assert = "The sheet names can only have 31 charachter. We are only allows to use this part:\n"
    if dct_fig:
        # NOTE(review): names are processed in reverse-sorted order, presumably
        # so that the resulting tab order ends up ascending -- confirm.
        for name_ in sorted(dct_fig.keys(), reverse=True):
            assert len(name_)<=31, str_assert + f"\n{name_[:31]}"
            lst_sheets.append(name_)
            sh = wb.sheets.add(name_)
            sh.pictures.add(dct_fig[name_], name=name_, update=True)
        print("Figures \t| Successfully added")
    for name_ in sorted(dct_df.keys(), reverse=True):
        assert len(name_)<=31, str_assert + f"\n{name_[:31]}"
        lst_sheets.append(name_)
        sh = wb.sheets.add(name_)
        # Write the frame without its index, starting at the top-left cell.
        sh.range('A1').options(index = None).value = dct_df[name_]
        # Expand from A1 over the written data and register it as a table.
        tbl_range = sh.range("A1").expand('table')
        # NOTE(review): sh.api is the native Excel object; this call looks
        # Windows/COM specific -- confirm before using on other platforms.
        sh.api.ListObjects.Add(xlSrcRange, tbl_range.address, 0, 1).Name = f"tbl_{name_}"
        sh.autofit(axis="columns")
    print("DataFrames \t| Successfully added")
    # Snapshot the sheet names first, because sheets are deleted in the loop.
    for sheet in [sh.name for sh in wb.sheets]:
        if sheet not in lst_sheets:
            print(sheet, "\t| deleted")
            wb.sheets[sheet].delete()
    if add_time_tag:
        from datetime import datetime
        # Removes every occurrence of ".xlsx", then re-appends it after the tag.
        file_name = file_name.replace(".xlsx", "")
        file_name += " -- " + datetime.now().strftime("%Y-%m-%d %H.%M.%S") + ".xlsx"
    print(f'\nPath: \t\t| {os.getcwd()}')
    print(f'Filen name: \t| {file_name}')
    wb.save(file_name)
    if close_workbook:
        wb.close()
    return file_name
# to_excel_from_dct(dct_df, f"{str_xl_name} ++ fuzyy", add_time_tag = True, close_workbook = False)
def value_counts(DF, str_column):
    """Return the frequency table of one column as a two-column DataFrame.

    Args:
        DF: Source DataFrame.
        str_column: Name of the column to count; missing values are ignored.

    Returns:
        A DataFrame with the columns ``str_column`` (the distinct values) and
        ``'count'`` (their frequency), ordered by descending frequency.
    """
    counts = DF[str_column].value_counts(dropna=True)
    # Name the index and the values explicitly instead of renaming afterwards:
    # pandas >= 2.0 already calls the values 'count' and the index after the
    # column, so the former rename mapping produced two 'count' columns.
    return counts.rename_axis(str_column).reset_index(name='count')
# value_counts(df, 'abfahrt_anzahl')
def get_df_log(dct_log, lst_rows_top = None, lst_rows_down = None):
    """Turn a log dictionary into a two-column ``key`` / ``value`` DataFrame.

    When the dictionary has a ``'script started at'`` entry, a
    ``'script duration'`` entry (time elapsed since then) is added and the
    start value is converted to a string. The caller's dictionary is not
    modified.

    Args:
        dct_log: Mapping of log keys to values.
        lst_rows_top: Keys that should appear first, in this order.
        lst_rows_down: Keys that should appear last, in this order.

    Returns:
        A DataFrame with one row per key and the columns ``key`` and ``value``.
    """
    # None instead of mutable [] defaults; copy so list concatenation in
    # reoder_columns also works for tuples.
    lst_rows_top = [] if lst_rows_top is None else list(lst_rows_top)
    lst_rows_down = [] if lst_rows_down is None else list(lst_rows_down)

    # Work on a shallow copy: stringifying the start time in the caller's
    # dict would break the subtraction on a second call.
    dct_log = dict(dct_log)
    if 'script started at' in dct_log:
        # The original looked up 'started_at' here, which is not the key that
        # was just checked and raised KeyError.
        started_at = dct_log['script started at']
        dct_log['script duration'] = str(datetime.now() - started_at)
        dct_log['script started at'] = str(started_at)

    df_log = pd.DataFrame([dct_log])
    df_log = reoder_columns(df_log, lst_rows_top, bl_left=True)
    df_log = reoder_columns(df_log, lst_rows_down, bl_left=False)
    # One row per key: transpose the single-row frame and expose the keys.
    df_log = df_log.T.reset_index()
    return df_log.rename(columns={'index': 'key', 0: 'value'})
def get_now():
    """Return the current local date and time as ``YYYY-MM-DD HH.MM.SS``."""
    from datetime import datetime

    current = datetime.now()
    return f"{current:%Y-%m-%d %H.%M.%S}"
def get_time():
    """Return the current local time as ``HH.MM.SS``."""
    from datetime import datetime

    current = datetime.now()
    return f"{current:%H.%M.%S}"
def get_date():
    """Return the current local date as ``YYYY-MM-DD``."""
    from datetime import datetime

    today = datetime.now()
    return f"{today:%Y-%m-%d}"
def get_files(filepath, str_ending = '*.json'):
    """Collect the absolute paths of all files below a folder that match a pattern.

    Args:
        filepath: Folder to search; all of its sub-folders are searched too.
        str_ending: Glob pattern applied inside every visited folder.

    Returns:
        A list of absolute paths, folder by folder in ``os.walk`` order.
    """
    # ``glob`` was used without ever being imported (NameError on first call).
    import glob
    import os

    lst_files = []
    for root, _dirs, _names in os.walk(filepath):
        # Escape the folder part so characters such as '[' in a directory
        # name are matched literally and only str_ending acts as a pattern.
        pattern = os.path.join(glob.escape(root), str_ending)
        lst_files.extend(os.path.abspath(match) for match in glob.glob(pattern))
    return lst_files
def get_excel_files_from_folder(str_file_address=""):
    r'''
    Return the names of all Excel files in a folder.

    Based on the MS documentation there are many Excel file extensions:
    .xlsx, .xlsm, .xls, .xlsb, .xltx, .xltm, .xlt, .xls, .xlam, .xla, .xlw, .xlr
    Instead of listing them, any file name (compared in lower case) that
    matches one of the following patterns is selected:
        \.xl..$
        \.xl.$

    Names starting with ``~`` are skipped, because they are backup files in
    Windows.

    Args:
        str_file_address: Folder to list. An empty string (the default)
            means the current working directory.

    Returns:
        A list of file names (not full paths) in ``os.listdir`` order.
    '''
    # os.listdir("") raises FileNotFoundError, so map the empty default to
    # the current directory.
    folder = str_file_address or '.'
    excel_suffix = re.compile(r'(\.xl..?)$')
    return [
        item
        for item in os.listdir(folder)
        if not item.startswith('~') and excel_suffix.search(item.lower())
    ]
def save_excel_from_dictionary(dct_df, str_xl_name, bl_open=False):
    '''
    Save a dictionary of DataFrames into a single Excel file.

    Sheet names are the same as the keys of the dictionary. Indices are not
    written, so you may prefer to call df.reset_index() before putting a
    DataFrame into the dictionary (=dct_df).

    Args:
        dct_df: Mapping of sheet name -> DataFrame.
        str_xl_name: Target file name; must end with ``.xlsx``.
        bl_open: When true, the file is opened in Excel afterwards
            (this only works on the Windows operating system).

    Returns:
        True once the file has been written.

    Raises:
        AssertionError: If ``str_xl_name`` does not end with ``.xlsx``.
    '''
    import os
    assert str_xl_name.endswith(
        ".xlsx"), f'The file name (={str_xl_name}) doesnt end with .xlsx'

    # The context manager saves and closes the file, even if writing a sheet
    # fails; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(str_xl_name, engine='xlsxwriter') as writer:
        # Write each dataframe to a different worksheet.
        for key_, value_ in dct_df.items():
            value_.to_excel(writer, sheet_name=key_, index=False)

    # This only works on windows operating system...
    if bl_open:
        # NOTE(review): the file name is interpolated into a shell command;
        # only pass trusted names here.
        os.system(f'start excel "{str_xl_name}"')
    return True
def reoder_columns(df, lst_ordered, bl_left=True):
    """Return ``df`` with the columns of ``lst_ordered`` moved to one side.

    Args:
        df: DataFrame whose columns are rearranged.
        lst_ordered: List of column names to move, kept in the given order.
        bl_left: True places them first, False places them last.

    Returns:
        ``df`` indexed by the new column order; the remaining columns keep
        their relative order.
    """
    remaining = [column for column in df.columns if column not in lst_ordered]
    new_order = lst_ordered + remaining if bl_left else remaining + lst_ordered
    return df[new_order]
# The following two functions can be used for Root Cause Analysis:
def augment_reason(DF, index_reason, str_reason):
    """Record a root-cause hypothesis for the selected rows of ``DF``.

    The columns ``'reason'`` and ``'tried hypothesis'`` are created when
    missing. ``'reason'`` is only filled for selected rows that have no
    reason yet; ``'tried hypothesis'`` collects every hypothesis tried on a
    selected row, joined by ``" | "``. A short summary is printed.

    Args:
        DF: DataFrame to annotate; it is modified in place.
        index_reason: Boolean Series selecting the rows the reason applies to.
        str_reason: Text of the reason / hypothesis.

    Returns:
        The same (modified) DataFrame.
    """
    for item in ['reason', 'tried hypothesis']:
        if item not in DF.columns:
            # object dtype: the column holds text, and a float NaN column
            # would hit a dtype mismatch on the string assignments below.
            DF[item] = pd.Series(np.nan, index=DF.index, dtype=object)
            print(f"Column '{item}' added to DataFrame.")

    # Only rows without a reason so far receive this one.
    index_final = DF['reason'].isna() & index_reason
    DF.loc[index_final, 'reason'] = str_reason

    # Append to the hypotheses tried so far; a row without any entry starts
    # with the plain reason instead of the former "nan | <reason>".
    tried_so_far = DF.loc[index_reason, 'tried hypothesis']
    DF.loc[index_reason, 'tried hypothesis'] = tried_so_far.map(
        lambda previous: str_reason if pd.isna(previous) else f"{previous} | {str_reason}"
    )

    print("-----------------------")
    print("reason \t|", str_reason)
    print(index_reason.sum(), " \t| Count of rows affected by the reason.")
    print( index_final.sum(), " \t| Count of rows affected by the reason, which werent affected already.")
    print(DF['reason'].isna().sum(), " \t| Count of remaining rows.")
    return DF
def augment_comment(DF, index_comment, str_comment):
    '''
    Add a comment to the selected rows for which no reason has been found yet.

    The comment can record a guess about the values. If, for example, rows
    are duplicated but we do not know why, it is good to write that down; it
    helps to resume the analysis in a meeting.

    Args:
        DF: DataFrame with a 'reason' column; it is modified in place and a
            'comment' column is created when missing.
        index_comment: Boolean Series selecting the rows to comment.
        str_comment: Text of the comment.

    Returns:
        The same (modified) DataFrame.

    Raises:
        AssertionError: If ``DF`` has no 'reason' column.
    '''
    assert 'reason' in DF.columns, "The column reason doesnt exist in the DataFrame."
    if 'comment' not in DF.columns:
        # Text column; create it explicitly so the count below can rely on it.
        DF['comment'] = pd.Series(np.nan, index=DF.index, dtype=object)

    # Rows that already have a reason are left without a comment.
    index_final = DF['reason'].isna() & index_comment
    DF.loc[index_final, 'comment'] = str_comment

    # Look at the stored comments, not only the current selection; otherwise
    # rows commented by an earlier call are counted as uncommented again.
    index_no_comment_no_reason = DF['reason'].isna() & DF['comment'].isna()

    print("-----------------------")
    print("comment |", str_comment)
    print(index_comment.sum(), " \t| Count of rows affected by the comment.")
    print(index_no_comment_no_reason.sum(), " \t| Count of remaining uncommented AND not reasoned rows.")
    return DF
# Sankey example
# NOTE(review): these statements run when the module is imported and need the
# third-party ipysankeywidget package. The widget created on the last line is
# not assigned to a name -- presumably this was a notebook cell whose output
# was displayed; confirm whether it belongs in this module.
from ipysankeywidget import SankeyWidget
# Each link is a flow of size 'value' from 'source' to 'target'; some links
# also carry a 'type' entry.
links = [
    {'source': 'start', 'target': 'A', 'value': 2, 'type': 'y'},
    {'source': 'start', 'target': 'B', 'value': 20},
    {'source': 'B', 'target': 'C', 'value': 5, 'type': 'x'},
    {'source': 'B', 'target': 'D', 'value': 15, 'type': 'x'},
]
SankeyWidget(links=links, margins=dict(top=0, bottom=0, left=50, right=100), linkLabelFormat='.0f')
# Timestamp taken when this module is loaded.
time_start = datetime.now()


def get_deltatime_in_seconds(time_start):
    """Return the whole seconds elapsed between ``time_start`` and now.

    Args:
        time_start: A ``datetime`` marking the start of the interval.

    Returns:
        The elapsed time in seconds as an int (fractions are dropped).
    """
    # timedelta.seconds is only the seconds *component* (0..86399) and drops
    # whole days; total_seconds() covers the complete duration.
    return int((datetime.now() - time_start).total_seconds())
def convert_seconds__days_hours_minutes_seconds(td):
    """Split a duration given in seconds into days, hours, minutes and seconds.

    Args:
        td: Duration in seconds.

    Returns:
        A dict with the keys ``'day'``, ``'hour'``, ``'minute'`` and
        ``'second'``.
    """
    # Days are no longer wrapped with "% 30": there is no month field, so the
    # wrap silently discarded everything from 30 days upwards.
    days, remainder = divmod(td, 60 * 60 * 24)
    hours, remainder = divmod(remainder, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return {'day': days, 'hour': hours, 'minute': minutes, 'second': seconds}
def get_duration_in_days_hours_minutes_seconds(start_time):
    """Return the time elapsed since ``start_time`` as a day/hour/minute/second dict.

    The elapsed seconds come from ``get_deltatime_in_seconds`` and are split
    by ``convert_seconds__days_hours_minutes_seconds``.
    """
    return convert_seconds__days_hours_minutes_seconds(
        get_deltatime_in_seconds(start_time)
    )
# get_duration_in_days_hours_minutes_seconds(start_time)
def get_dct_log():
    """Create a fresh log dictionary describing the current run.

    Returns:
        A dict with the keys ``'start_time'`` (current datetime),
        ``'Ran by'`` (user name from ``getpass.getuser``) and
        ``'Current working directory'``.
    """
    import getpass
    from datetime import datetime

    dct_log = {}
    dct_log['start_time'] = datetime.now()
    dct_log['Ran by'] = getpass.getuser()
    dct_log['Current working directory'] = os.getcwd()
    return dct_log
def get_common_words_in_column(DF, str_col):
    """Return the words that occur in every row of a text column.

    Each value is lower-cased and split on single spaces; the result is the
    intersection of the word sets of all rows.

    Args:
        DF: Source DataFrame.
        str_col: Name of a column holding strings.

    Returns:
        A set of lower-case words; empty when the column has no rows or the
        rows share no word.
    """
    common_words = None
    # Iterate over the values directly: the former ``DF[str_col][0]`` was a
    # label lookup and failed for empty columns or an index without label 0.
    for text in DF[str_col]:
        row_words = set(text.lower().split(" "))
        common_words = row_words if common_words is None else common_words & row_words
        # Stop early once nothing is shared any more. The original compared
        # against ``{}`` (an empty dict), which is never equal to a set.
        if not common_words:
            return common_words
    return set() if common_words is None else common_words