from collections import Counter
import random
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import matplotlib.colors as mplc
import mplcyberpunk  # importing registers the "cyberpunk" matplotlib style

color_list = list(mplc.cnames)  # all named matplotlib colours
# plt.style.use("bmh")
plt.style.use("cyberpunk")
NUM_CLUSTERS=2
np.random.seed(1001)
random_state=11
def normalize(X):
    # min-max normalizer
    return (X - X.min()) / (X.max() - X.min())
def calc_distance(x1, x2):
    # Euclidean distance between each row of x1 and the point x2
    return np.sqrt(np.sum(np.square(x1 - x2), axis=1))
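# Illustrative usage with made-up values:
# calc_distance(np.array([[0., 0.], [3., 4.]]), np.array([0., 0.]))  # -> array([0., 5.])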
def find_k_nearest_neighbours(X, y, input_data, k):
    # given the dataset points, their labels, an unlabelled query point
    # and the neighbourhood size k, return the k nearest neighbours
    distance = calc_distance(X, input_data)
    indices = np.argsort(distance)[:k]
    return X[indices], y[indices]
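# Illustrative usage (hypothetical shapes: X is (n, 2), y is (n,)):
# X_near, y_near = find_k_nearest_neighbours(X, y, np.array([0.5, 0.5]), k=3)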
def plot_data_points(X, y, input_data):
    # plot the data points and the unknown data
    fig, ax = plt.subplots(1, 1,
                           constrained_layout=True,
                           figsize=(5, 5))
    class1_data, class1_labels = X[y == 1], y[y == 1]
    class2_data, class2_labels = X[y == 0], y[y == 0]
    ax.scatter(class1_data[:, 0],
               class1_data[:, 1],
               color="darkred",
               label="class1",
               s=15,
               alpha=0.7)
    ax.scatter(class2_data[:, 0],
               class2_data[:, 1],
               color="lightblue",
               label="class2",
               s=15,
               alpha=0.7)
    ax.scatter(input_data[0],
               input_data[1],
               color="cyan",
               label="unknown",
               alpha=0.7,
               s=40)
    ax.legend(loc="best")
    ax.set_ylim((-0.1, 1.1))
    ax.set_xlim((-0.1, 1.1))
    ax.set_title("Distribution of samples")
    return fig
def plot_data_points_with_labels(X, y, X_nearest, y_nearest, input_data, label):
    # plot the data points, highlight the k nearest neighbours and show the prediction
    fig, ax = plt.subplots(1, 1,
                           constrained_layout=True,
                           figsize=(5, 5))
    # select a distinct colour for each of the k neighbours
    colors = random.sample(color_list, len(X_nearest))
    class1_data, class1_labels = X[y == 1], y[y == 1]
    class2_data, class2_labels = X[y == 0], y[y == 0]
    ax.scatter(class1_data[:, 0],
               class1_data[:, 1],
               color="darkred",
               label="class1",
               s=15,
               alpha=0.7)
    ax.scatter(class2_data[:, 0],
               class2_data[:, 1],
               color="lightblue",
               label="class2",
               s=15,
               alpha=0.7)
    ax.scatter(input_data[0],
               input_data[1],
               color="cyan",
               # label="unknown",
               s=40,
               alpha=0.7)
    ax.text(0.7, 0.05,
            "Predicted Label: {}".format(label),
            fontsize=15)
    # draw a line from the query point to each of its k nearest neighbours
    for i, x_near in enumerate(X_nearest):
        ax.plot([x_near[0], input_data[0]],
                [x_near[1], input_data[1]],
                color=colors[i],
                label="neighbour {}".format(i + 1))
    ax.legend(loc="best")
    ax.set_ylim((-0.1, 1.1))
    ax.set_xlim((-0.1, 1.1))
    ax.set_title("Nearest Neighbour Prediction")
    return fig
def get_labels(X, y, input_data, k):
    X_nearest, y_nearest = find_k_nearest_neighbours(X, y, input_data, k)
    # plurality (mode) vote among the k nearest labels
    label = Counter(y_nearest).most_common()[0][0]
    return label, X_nearest, y_nearest
@st.cache
def get_data(num_points=100):
    # generate 2-d synthetic data with 2 cluster centers
    X, y = make_blobs(n_samples=num_points,
                      n_features=2,
                      centers=NUM_CLUSTERS,
                      cluster_std=4.1,
                      random_state=random_state)
    X = normalize(X)
    return X, y
def introduction():
    st.subheader("Introduction")
    markdown = """
The Nearest Neighbour (NN) algorithm is among the "simplest" of the supervised algorithms, used mostly for the **task** of **Classification** (though regression is also possible). Nearest Neighbour is a special case of the k-Nearest Neighbours (k-NN) algorithm where $k=1$.
#### Characteristics of the kNN algorithm
- **Lazy**: In **k-NN** all the labelled training instances are simply stored and no explicit learning takes place; hence k-NN is called a **lazy** learning algorithm.
During testing, each test instance is compared with the nearby training instances and its label is decided accordingly.
(What **near** means exactly is not clear at this point, but it will be clarified soon.)
- **Local**: Instead of learning an approximate mapping function $f(x)=y$ globally from the training data, kNN approximates the function **locally**, depending on the query point as well as the training instances.
- **Instance based**: Since predicting the class of a test point amounts to comparing the query point with the training set, kNN is called an **instance-based** learning algorithm.
- **Non-parametric**: The kNN algorithm makes no assumptions about the training data or the mapping function; hence there are no parameters to learn.
- Commonly used as a **discriminative** model.
---
#### Prediction Algorithm for 1-Nearest Neighbour
---
closest_point := None
closest_distance := $\infty$
for $i = 1, \ldots, n$:
 - current_distance := $d(x^{[i]}, x^{[q]})$
 - if current_distance < closest_distance:
   * closest_distance := current_distance
   * closest_point := $x^{[i]}$
---
The prediction of the 1-Nearest Neighbour algorithm is the target value of the closest point.
By default, closeness is measured using the **Euclidean Distance** (also called the $L^2$ distance) between two points $x^{[a]}$ and $x^{[b]}$, given by the formula:

$$d(x^{[a]}, x^{[b]}) = \sqrt{\sum_{j=1}^{m} \left(x^{[a]}_j - x^{[b]}_j\right)^2}$$

where $m$ is the number of features.
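A minimal NumPy sketch of this 1-NN rule (illustrative only; the helper name `predict_1nn` is not part of this app):
```python
import numpy as np

def predict_1nn(X_train, y_train, x_query):
    # Euclidean distance from the query point to every training point
    distances = np.sqrt(np.sum((X_train - x_query) ** 2, axis=1))
    # the prediction is the target value of the closest training point
    return y_train[np.argmin(distances)]
```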
In k-Nearest Neighbours there are two prediction rules:
- Plurality voting among the $k$ nearest neighbours for classification, i.e. the **mode** of the target labels of the k nearest neighbours is chosen as the predicted label.
- Averaging the continuous target values of the $k$ nearest neighbours for regression, i.e. the **mean** of the target values of the k nearest neighbours is taken as the prediction.
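Both rules take only a few lines of NumPy (a sketch with made-up values; `y_nearest` stands for the labels or targets of the $k$ nearest neighbours):
```python
import numpy as np
from collections import Counter

y_nearest = np.array([1, 0, 1])                    # labels of the k = 3 nearest neighbours
label = Counter(y_nearest).most_common(1)[0][0]    # classification: plurality vote -> 1
target = np.mean(np.array([2.5, 3.0, 3.5]))        # regression: mean of the targets -> 3.0
```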
#### Advantages:
- Simple algorithm and easy to implement.
- Powerful predictive algorithm, since it does not approximate a global mapping from the training features to the target space.
#### Disadvantages:
- Suffers from the **curse of dimensionality**: as the number of features of the data increases, so does the cost of prediction.
- If you are familiar with the time complexity of algorithms, k-NN in its naive version has a prediction time complexity of $O(nm)$, where $n$ is the number of training instances and $m$ is the number of features of each data point in the training set.
There are other implementations that reduce this complexity (not discussed here).
---
#### Prediction Algorithm for Naive k-NN
---
$\mathrm{D}_k := \{\}$
while $|\mathrm{D}_k| < k$:
 - closest_distance := $\infty$
 - for $i = 1, \ldots, n$, if $x^{[i]} \notin \mathrm{D}_k$:
   - current_distance := $d(x^{[i]}, x^{[q]})$
   - if current_distance < closest_distance:
     - closest_distance := current_distance
     - closest_point := $x^{[i]}$
 - add closest_point to $\mathrm{D}_k$
---
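A direct NumPy translation of this naive selection loop (a sketch; the helper name `k_nearest_indices` is not part of this app):
```python
import numpy as np

def k_nearest_indices(X_train, x_query, k):
    # repeatedly pick the closest point not chosen yet, O(n*k); assumes k <= len(X_train)
    D_k = []
    while len(D_k) < k:
        closest_distance, closest_i = np.inf, None
        for i, x_i in enumerate(X_train):
            if i in D_k:
                continue
            current_distance = np.sqrt(np.sum((x_i - x_query) ** 2))
            if current_distance < closest_distance:
                closest_distance, closest_i = current_distance, i
        D_k.append(closest_i)
    return D_k
```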
"""
st.markdown(markdown)
def max_width_(prcnt_width: int = 75):
    max_width_str = f"max-width: {prcnt_width}%;"
    st.markdown(f"""
<style>
.reportview-container .main .block-container{{{max_width_str}}}
</style>
""",
        unsafe_allow_html=True,
    )
def app():
    max_width_(80)
    st.title("k-Nearest Neighbours Algorithm")
    st.markdown("---")
    introduction()
    st.subheader("Check the Nearest Neighbours algorithm for yourself")
    st.markdown("")
    markdown = """
In the following interactive example, you can observe the effect of choosing the value of $k$ in the k-Nearest Neighbours algorithm. The dataset generated here is two-dimensional for ease of visualization.
You can change the size of the training data, the feature values of the query point (termed _feature 1_ and _feature 2_), and the number of neighbours $k$ considered in the prediction.
"""
    st.markdown(markdown)
    st.markdown("")
    st.markdown("")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        num_points = st.slider("Choose size of training dataset",
                               min_value=10,
                               max_value=500,
                               key="num_points")
    with col2:
        x_ = st.slider("Choose input feature 1",
                       min_value=0.0,
                       max_value=1.0,
                       key="x")
    with col3:
        y_ = st.slider("Choose input feature 2",
                       min_value=0.0,
                       max_value=1.0,
                       key="y")
    with col4:
        k = st.slider("Choose the number of neighbours.",
                      min_value=1,
                      max_value=9,
                      key="k")
    input_data = np.array([x_, y_])
    X, y = get_data(num_points)
    st.subheader("Visualize the Dataset and neighbours.")
    col1, col2 = st.columns(2)
    with col1:
        st.pyplot(plot_data_points(X, y, input_data))
    with col2:
        label, X_nearest, y_nearest = get_labels(X, y, input_data, k)
        # print(X_nearest.shape)
        st.pyplot(plot_data_points_with_labels(X, y, X_nearest, y_nearest, input_data, label))
st.header("References")
references="""
- Sebastian Raschka's notes on [kNN](https://github.com/rasbt/stat451-machine-learning-fs20/raw/master/L02/02-knn__notes.pdf) from his course
[STAT 451](https://sebastianraschka.com/teaching/stat451-fs2020/) - Introduction to Machine Learning and Statistical Pattern Classification 2020
"""
st.markdown(references)
if __name__ == "__main__":
    app()