-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reg.py
65 lines (56 loc) · 2.46 KB
/
Reg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Import block
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
from preprocessing_wrapper import load_preprocessed_data
from PCA import airbnb_PCA_n
from Cross_validation import cross_validation
# Name of the column the regression predicts.
target = 'Price'

# Load the preprocessed listings and discard the identifier-like columns
# before building the model inputs.
data = load_preprocessed_data().drop(
    columns=["Listing_ID", "Host_ID", "Postal_Code"]
)

# Every remaining column except the target is a feature.
features = data.columns.drop(target)

# X holds the feature columns; Y is kept as a single-column DataFrame.
X = data.drop(columns=[target])
Y = data[[target]]
def stratify(X, Y, field):
    """Split X and Y 80/20, stratified on ``field``, and print a proportion report.

    Two splits are made: a plain random one (used only as a baseline) and one
    stratified on ``X[field]``. A table is printed comparing, for each value
    of ``field``, its share in the full input, in the random test set and in
    the stratified test set, together with the percentage error of each test
    set relative to the input.

    Args:
        X: Feature DataFrame; must contain the column ``field``.
        Y: Target data aligned row-for-row with ``X``.
        field: Name of the column in ``X`` to stratify on.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` from the stratified split.
    """
    # Baseline random split: only its test features are needed for the report.
    _, x_test, _, _ = train_test_split(X, Y, test_size=0.2)
    x_train_strat, x_test_strat, y_train_strat, y_test_strat = train_test_split(
        X, Y, stratify=X[field], test_size=0.2
    )

    def accomodation_proportions(data, field):
        """Return the fraction of rows in ``data`` taken by each value of ``field``."""
        return data[field].value_counts() / len(data)

    # Report on the column that was actually used for stratification rather
    # than a fixed column name, so the table matches the split it describes.
    compare_props = pd.DataFrame({
        "Input_dataset": accomodation_proportions(X, field),
        "Test_set": accomodation_proportions(x_test, field),
        "Strat_set": accomodation_proportions(x_test_strat, field),
    }).sort_index()

    # Relative deviation (in percent) of each test set from the input shares.
    compare_props["Test set. %error"] = (
        100 * compare_props["Test_set"] / compare_props["Input_dataset"] - 100
    )
    compare_props["Strat test set. %error"] = (
        100 * compare_props["Strat_set"] / compare_props["Input_dataset"] - 100
    )
    print(compare_props)

    return (x_train_strat, x_test_strat, y_train_strat, y_test_strat)
def Reg(stratify:bool=False, field:str=None, pca:bool=False):
    """Fit a linear regression on the preprocessed data and return its test RMSE.

    The data is reloaded, identifier columns are dropped, and an 80/20
    train/test split is made. The model's score on the test set (R^2, as
    returned by ``LinearRegression.score``) is printed as a side effect.

    Args:
        stratify: When true and ``field`` is given, use the module-level
            ``stratify`` helper for the split instead of a plain random split.
        field: Column name to stratify on; ignored unless ``stratify`` is true.
        pca: When true, pass the train/test features through ``airbnb_PCA_n``
            before fitting.

    Returns:
        Root mean squared error of the predictions on the test set.
    """
    data = load_preprocessed_data()
    data = data.drop(columns=["Listing_ID", "Host_ID", "Postal_Code"])
    X = data.drop(columns=['Price'])
    Y = data[['Price']]

    # Create the training and testing set
    if stratify and field is not None:
        # The boolean parameter ``stratify`` shadows the module-level helper of
        # the same name, so calling ``stratify(...)`` here would call a bool.
        # Resolve the helper through the module namespace instead.
        stratified_split = globals()["stratify"]
        x_train, x_test, y_train, y_test = stratified_split(X, Y, field)
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=None
        )

    if pca:
        # NOTE(review): 80 is presumably the number of components to keep;
        # confirm against airbnb_PCA_n.
        x_train, x_test = airbnb_PCA_n(x_train, x_test, 80)

    # Create regressor and fit on the training set
    clf = LinearRegression()
    clf.fit(x_train, y_train)

    # Create predictions
    pred = clf.predict(x_test)

    # Print the test-set score (R^2 for LinearRegression)
    acc = clf.score(x_test, y_test)
    print(acc)

    # Take the square root of the MSE explicitly: the ``squared=False`` keyword
    # was deprecated and later removed from mean_squared_error.
    rmse = metrics.mean_squared_error(y_test, pred) ** 0.5
    return rmse
# Report the hold-out RMSE of the linear model, first with PCA and then without.
for use_pca in (True, False):
    print(Reg(pca=use_pca))

# Report the result of cross_validation for a fresh linear model on the
# module-level X and Y, with PCA enabled.
cv_result = cross_validation(LinearRegression(), 5, X, Y, pca=True)
print(cv_result)