# displays.py
import matplotlib.pyplot as pl
import numpy as np
from sklearn.model_selection import (ShuffleSplit, train_test_split,
                                     learning_curve, validation_curve)
from sklearn.tree import DecisionTreeRegressor

def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of
    training data. The training and testing scores for each model are then
    plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Generate nine evenly spaced training set sizes, up to 80% of the data
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10, 7))

    # Create four different models based on max_depth
    for k, depth in enumerate([1, 3, 6, 10]):

        # Create a decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth=depth)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes, train_mean - train_std,
                        train_mean + train_std, alpha=0.15, color='r')
        ax.fill_between(sizes, test_mean - test_std,
                        test_mean + test_std, alpha=0.15, color='g')

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16, y=1.03)
    fig.tight_layout()
    fig.savefig('learning curve.png')
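
# Example usage (a minimal sketch; `features` and `prices` are assumed names
# for a feature matrix and target vector, e.g. from the housing data this
# module appears to accompany):
#
#   ModelLearning(features, prices)   # writes 'learning curve.png'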

def ModelComplexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
    The training and validation scores are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and validation scores
    train_scores, test_scores = validation_curve(
        DecisionTreeRegressor(), X, y, param_name='max_depth',
        param_range=max_depth, cv=cv, scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth, train_mean - train_std,
                    train_mean + train_std, alpha=0.15, color='r')
    pl.fill_between(max_depth, test_mean - test_std,
                    test_mean + test_std, alpha=0.15, color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.savefig('new House complexity curve.png')
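
# Example usage (same assumed `features`/`prices` as above). The resulting
# plot typically shows the training score climbing with depth while the
# validation score peaks and then falls off as the tree overfits:
#
#   ModelComplexity(features, prices)   # writes 'new House complexity curve.png'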

def PredictTrials(X, y, fitter, data):
    """ Performs trials of fitting and predicting data. """

    # Store the predicted prices
    prices = []

    for k in range(10):
        # Split the data, shuffled differently on each trial
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=k)

        # Fit the data
        reg = fitter(X_train, y_train)

        # Make a prediction on the first client's data
        pred = reg.predict([data[0]])[0]
        prices.append(pred)

        # Result
        print("Trial {}: ${:,.2f}".format(k + 1, pred))

    # Display the price range across trials
    print("\nRange in prices: ${:,.2f}".format(max(prices) - min(prices)))