-
Notifications
You must be signed in to change notification settings - Fork 0
/
gadget.py
112 lines (76 loc) · 2.62 KB
/
gadget.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
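This file contains several standalone test/utility functions that are not part
of the main program's execution.
For each function's purpose, inputs and outputs, see its own docstring.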
"""
from os.path import join
from pandas import DataFrame, read_csv, concat
from typing import Tuple
from ydata_profiling import ProfileReport
from utils import get_df
def check_data_nan(path: str = 'outputs/predictions/HAHAHAA-10.csv'):
    """Report which columns of a CSV file contain NaN values.

    The file is loaded with ``read_csv``. The row count, every column name
    and, for each column holding at least one missing value, the number of
    NaN entries are printed to standard output. Nothing is returned.

    Args:
        path (str): CSV file to inspect. Defaults to the prediction file
            this function previously had hard-coded, so zero-argument calls
            behave exactly as before.
    """
    data = read_csv(path)
    print(f"The length of the given data: {len(data)}")
    print("The given data has below columns: ")
    for column_name in data.columns:
        print(column_name)
        nan_values_number = data[column_name].isna().sum()
        # Only columns that actually contain missing values get a notice.
        if nan_values_number > 0:
            print(f"Notice: {column_name} has {nan_values_number} NaN values.")
def concat_data(dfs: Tuple[DataFrame, ...]) -> DataFrame:
    """Stack the given dataframes vertically into a single dataframe.

    Args:
        dfs (Tuple[DataFrame, ...]): The dataframes to stack, in order. Any
            number of frames may be supplied (the previous annotation,
            ``Tuple[DataFrame]``, described a tuple of exactly one frame).

    Returns:
        DataFrame: The rows of every input appended along axis 0. Rows keep
            the index labels they had in their source frame, so labels can
            repeat in the result.

    Raises:
        ValueError: Propagated from ``concat`` when ``dfs`` is empty.
    """
    # DataFrame(...) re-wraps concat's result, matching the annotated
    # return type.
    return DataFrame(concat(objs=dfs, axis=0))
def concat_and_save_data():
    """Load the source dataframes, stack them, and write the result to disk.

    Chains ``load_data_x`` -> ``concat_data`` -> ``save_data``. Takes no
    arguments and returns nothing.
    """
    save_data(df=concat_data(load_data_x()))
def do_eda():
    """Generate profiling reports for the training data.

    Reads ``data/train.csv`` and writes two HTML reports with
    ``ProfileReport``: one over every row, and one restricted to the rows
    whose ``label`` column equals 1.
    """
    train = read_csv(filepath_or_buffer='data/train.csv')
    # Rows with label == 1 are the ones reported as "fraud data".
    fraud_only = train.loc[train.label == 1]

    # (frame, report title, output file) -- processed in this order.
    jobs = (
        (train,
         'Data Analysis (training.csv)',
         'outputs/EDAs/ydata-profiling/data_analysis (training.csv).html'),
        (fraud_only,
         'Data Analysis (only fraud data)',
         'outputs/EDAs/ydata-profiling/fraud_data_analysis.html'),
    )
    for frame, title, target in jobs:
        ProfileReport(df=frame, title=title).to_file(output_file=target)
def get_small_data():
    """Save a reduced copy of the "new_public" data for quick testing.

    Fetches the data through ``get_df``, keeps the first 100000 rows via
    ``head`` and hands them to ``save_data``.
    """
    # NOTE: change the output file name inside save_data() before running
    # this, otherwise the sample is written to whatever file save_data()
    # currently targets.
    subset = get_df(data_name="new_public").head(100000)
    save_data(df=subset)
def load_data_x() -> Tuple[DataFrame, DataFrame]:
    """Load the "old_train" and "new_public" datasets through ``get_df``.

    Returns:
        Tuple[DataFrame, DataFrame]: A two-element tuple holding the
            "old_train" data first and the "new_public" data second.
            (Assumes ``get_df`` returns a ``DataFrame`` -- it is defined in
            ``utils``; confirm there.)
    """
    data_1 = get_df(data_name="old_train")
    data_2 = get_df(data_name="new_public")
    return data_1, data_2
def save_data(df: DataFrame, file: str = join("data", "train.csv")):
    """Write a dataframe to a CSV file without its index column.

    Args:
        df (DataFrame): The dataframe to save.
        file (str): Destination path. Defaults to ``data/train.csv``, the
            location this function always wrote to before the parameter
            existed, so existing ``save_data(df=...)`` calls are unchanged.
            Pass another path to avoid overwriting the training file.
    """
    # index=False keeps the row labels out of the file.
    df.to_csv(path_or_buf=file, index=False)
if __name__ == "__main__":
    """ Main function.
    """
    # Manual entry point: nothing runs by default. Uncomment the single
    # helper below that you want to execute when running this file as a
    # script.
    # check_data_nan()
    # concat_and_save_data()
    # do_eda()
    # get_small_data()