-
Notifications
You must be signed in to change notification settings - Fork 1
/
Main_and_Data_Cleaning.py
105 lines (72 loc) · 3.09 KB
/
Main_and_Data_Cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from collections import namedtuple
from linear_regression import Linear_Regression
import matplotlib.pyplot as plt
import pandas as pd
import csv
# Column order of the raw CSV; values are read positionally in this order.
_VARIABLES = ["date", "meantempm", "meandewptm", "meanpressurem", "maxhumidity", "minhumidity", "maxtempm",
              "mintempm", "maxdewptm", "mindewptm", "maxpressurem", "minpressurem", "precipm"]

# Columns kept alongside the lagged features; every other raw column is dropped.
_TARGET_COLUMNS = ['meantempm', 'mintempm', 'maxtempm']


def add_nth_day_prior_features(df, target_variable, days_prior):
    """Add a column holding ``target_variable`` from ``days_prior`` rows earlier.

    The new column is named ``"<target_variable>_<days_prior>"`` and is added
    to ``df`` in place. The first ``days_prior`` rows have no earlier row to
    draw from and are left missing.

    Args:
        df: DataFrame to extend; modified in place.
        target_variable: Name of the existing column to lag.
        days_prior: Number of rows to look back (assumes one row per day --
            TODO confirm against the data file).
    """
    col_name = "{}_{}".format(target_variable, days_prior)
    # shift() works purely by position, so it is independent of the index
    # labels and also copes with frames shorter than ``days_prior``.
    df[col_name] = df[target_variable].shift(days_prior)


def _load_weather_frame(data_path):
    """Read the raw CSV at ``data_path`` into a DataFrame indexed by ``date``.

    The first row is discarded (presumably the header -- verify against the
    data file) and the first ``len(_VARIABLES)`` fields of every remaining row
    are mapped positionally onto ``_VARIABLES``. Values stay as strings here.

    Raises:
        ValueError: If the file contains no data rows.
    """
    AnnualWeatherReport = namedtuple("AnnualWeatherReport", _VARIABLES)
    records = []
    # newline="" is what the csv module expects so it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(data_path, "r", newline="") as raw_data:
        reader = csv.reader(raw_data, delimiter=",")
        next(reader, None)  # drop the first row without failing on an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to record.
                continue
            records.append(AnnualWeatherReport._make(row[:len(_VARIABLES)]))
    if not records:
        raise ValueError("no weather records found in {!r}".format(data_path))
    return pd.DataFrame(records, columns=_VARIABLES).set_index('date')


def _outlier_summary(df):
    """Return ``df.describe().T`` with an added boolean ``outliers`` column.

    A column is flagged when its minimum lies more than 3 IQRs below the 25th
    percentile or its maximum lies more than 3 IQRs above the 75th percentile.
    """
    spread = df.describe().T
    iqr = spread['75%'] - spread['25%']
    below = spread['min'] < (spread['25%'] - (3 * iqr))
    above = spread['max'] > (spread['75%'] + 3 * iqr)
    spread['outliers'] = below | above
    return spread


def _plot_distribution(df, column):
    """Draw a histogram of ``df[column]`` on the current matplotlib axes."""
    df[column].hist()
    plt.title('Distribution of {}'.format(column))
    plt.xlabel(column)


def main(data_path="ottawa_raw_data.txt"):
    """Build lagged weather features from the raw data and run the regression.

    Steps: load the CSV, add 1-, 2- and 3-row lagged copies of every variable,
    keep only the temperature columns plus the lagged features, convert
    everything to numbers, fill precipitation gaps, drop incomplete rows and
    hand the result to ``Linear_Regression``.

    Args:
        data_path: Path of the comma-separated raw data file.
    """
    df = _load_weather_frame(data_path)

    for target_variable in _VARIABLES:
        if target_variable != 'date':
            for days_prior in range(1, 4):
                add_nth_day_prior_features(df, target_variable, days_prior)

    # Drop the same-day raw variables (except the temperature columns) and
    # keep every lagged feature.
    variables_removed = [variable for variable in _VARIABLES if variable not in _TARGET_COLUMNS]
    variables_maintained = [col for col in df.columns if col not in variables_removed]
    df = df[variables_maintained]

    # The CSV values are strings; anything that is not a number becomes NaN.
    df = df.apply(pd.to_numeric, errors='coerce')
    # print(df.info())

    # Exploratory only: print spread.loc[spread.outliers] to list the columns
    # that contain extreme outliers.
    spread = _outlier_summary(df)
    # print(spread.loc[spread.outliers])

    plt.rcParams['figure.figsize'] = [14, 8]
    # NOTE(review): both histograms are drawn without plt.show() or a new
    # figure in between, so they land on the same axes -- confirm whether a
    # separate figure per column is wanted before enabling display.
    _plot_distribution(df, 'maxhumidity_1')
    # plt.show()
    _plot_distribution(df, 'minpressurem_1')
    # plt.show()

    # Treat a missing prior-day precipitation value as 0 so the dropna() below
    # does not discard a row only because of a precipitation gap. Assigning
    # the filled column back avoids chained assignment, which does not update
    # the frame when pandas Copy-on-Write is active.
    for precip_col in ['precipm_1', 'precipm_2', 'precipm_3']:
        df[precip_col] = df[precip_col].fillna(0)

    df = df.dropna()

    # NOTE(review): 0.6 and 0.05 are passed straight to Linear_Regression;
    # presumably a correlation threshold and a significance level -- confirm
    # in the linear_regression module.
    ln = Linear_Regression(df, 0.6)
    ln.correlation_assessment()
    ln.print_df_corr()
    df2 = ln.get_suitable_predictors()
    ln.plot_graph(df2)
    x = ln.further_filtering_the_predictors(0.05, df2)
    y = df2['meantempm']
    ln.print_results(x, y)
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger file reads and plotting.
if __name__ == "__main__":
    main()