import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from functools import partial
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, r2_score
from scipy.stats import mode
Personal notes for myself about K-nearest neighbours and how to implement the algorithm from scratch.
1. Introduction
K-Nearest Neighbours is a supervised learning algorithm based on distance metrics: a new point is classified according to the labels of its \(K\) closest neighbours in the feature space.
1.1. Algorithm
The KNN algorithm is probably one of the simplest machine learning models, yet it still yields effective results. The idea is to represent the labelled points in the space and classify or regress a new unlabelled point by looking at the labels of its \(K\) closest neighbours.
Let’s visualize an example. Below, we observe a dataset composed of 10 points, each with an associated label. We want to classify the point in red. Which label do we assign to it?
def plot_knn(k_val=None):
    np.random.seed(3)
    X = np.random.normal(0, 0.05, size=(10, 2))
    y = np.random.choice([0, 1], size=len(X))
    new_point = np.random.normal(0, 0.05, size=2)

    # Compute Euclidean distance from new_point to the rest of the points
    euclidean_from_new_point = partial(euclidean, v=new_point)
    distances = np.apply_along_axis(euclidean_from_new_point, axis=1, arr=X)
    dist_idx_sort = np.argsort(distances)

    # Plot
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))

    ax.scatter(X[:, 0], X[:, 1], c=y, s=300)
    ax.scatter(new_point[0], new_point[1], c="red", marker="*", s=300)

    if k_val is not None:
        # Sort X based on distance and select the closest k_val points
        closest_points = X[dist_idx_sort][:k_val]
        closest_distances = distances[dist_idx_sort][:k_val]

        # Plot arrows to the closest k_val points
        for i in range(closest_points.shape[0]):
            x_target, y_target = closest_points[i]
            distance = closest_distances[i]

            # Draw arrow from new_point to the closest point
            ax.annotate('',
                        xy=(x_target, y_target),
                        xytext=(new_point[0], new_point[1]),
                        arrowprops=dict(
                            facecolor='blue',
                            edgecolor='blue',
                            arrowstyle='->',
                            lw=2,
                        ),
            )

            # Calculate the midpoint for placing the text
            mid_point = (new_point + closest_points[i]) / 2
            ax.text(mid_point[0],
                    mid_point[1],
                    f'{distance:.2f}',
                    color='blue',
                    fontsize=12,
                    ha='center',
                    va='center'
            )

    ax.grid(True, alpha=.3)
    if k_val is None:
        ax.set_title("Dataset", fontsize=24)
    else:
        ax.set_title(f"Dataset | k: {k_val}", fontsize=24)
    ax.set_xlabel("Feature $x_{1}$", fontsize=20)
    ax.set_ylabel("Feature $x_{2}$", fontsize=20)
    ax.tick_params(labelsize=20)
plot_knn()
KNN uses a distance metric, like the Euclidean distance, to determine the closest neighbours and takes the mode of their labels to decide which class to assign to the new point. If we were working with continuous values, we would instead assign the average or median value of the closest labelled data points to the new point.
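As a minimal sketch of this aggregation step (the neighbour labels below are made up for illustration), the classification and regression variants differ only in how the \(K\) closest labels are combined:

import numpy as np
from scipy.stats import mode

# Hypothetical labels of the K = 3 closest neighbours
k_closest_classes = np.array([1, 0, 1])       # classification: discrete labels
k_closest_values = np.array([2.4, 3.1, 2.8])  # regression: continuous targets

predicted_class = mode(k_closest_classes).mode  # majority vote -> 1
predicted_value = np.mean(k_closest_values)     # average -> 2.77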
Below I show the plots of the K-nearest neighbours for different values of \(K\).
plot_knn(k_val=3)
plot_knn(k_val=7)
Notice that the same point could be assigned to different classes depending on the value of \(K\). In addition, for binary classification it is better to select an odd value of \(K\) to avoid ties.
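As a quick illustration of the tie problem (the labels below are made up): with an even \(K\) and two classes the vote can split evenly, and scipy.stats.mode then simply returns the smallest tied label rather than a genuine majority.

import numpy as np
from scipy.stats import mode

even_k_labels = np.array([0, 0, 1, 1])  # even K: the vote splits two against two
odd_k_labels = np.array([0, 1, 1])      # odd K: a clear majority exists

print(mode(even_k_labels).mode)  # 0, an arbitrary tie-break (smallest tied value)
print(mode(odd_k_labels).mode)   # 1, a genuine majority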
2. Coding a KNN from scratch
We code the KNN following the Scikit-Learn API for machine learning algorithms. The distance metric can be passed as a callable.
class KNNModel:
    def __init__(self, is_classification, k_neighbours=3, distance=euclidean):
        self.is_classification = is_classification
        self.k_neighbours = k_neighbours
        self.distance = distance

        # attributes
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        k_neighbours = self.k_neighbours
        is_classification = self.is_classification
        distance = self.distance
        y_train = self.y_train
        X_train = self.X_train

        n_samples = X.shape[0]
        predictions = np.empty(shape=(n_samples, 1))
        for i, x_sample in enumerate(X):
            distance_from_new_point = partial(distance, v=x_sample)
            distances = np.apply_along_axis(
                distance_from_new_point, axis=1, arr=X_train
            )
            dist_idx_sort = np.argsort(distances)

            k_closest_labels = y_train[dist_idx_sort][:k_neighbours]
            if is_classification:
                predictions[i] = mode(k_closest_labels).mode
            else:
                predictions[i] = np.mean(k_closest_labels)

        return predictions
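Because the distance is stored as a plain callable with a (u, v) signature, other metrics can be plugged in without touching the class. Here is a small usage sketch; the choice of scipy's cityblock (Manhattan) distance and the demo dataset are just for illustration:

from scipy.spatial.distance import cityblock
from sklearn.datasets import make_classification

# Small synthetic dataset just for this illustration
X_demo, y_demo = make_classification(n_samples=100, n_features=2, n_informative=2,
                                     n_redundant=0, random_state=0)

# Same model, but neighbours are now ranked by L1 distance instead of L2
manhattan_knn = KNNModel(is_classification=True, k_neighbours=5, distance=cityblock)
manhattan_knn.fit(X_demo[:80], y_demo[:80])
manhattan_preds = manhattan_knn.predict(X_demo[80:])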
2.1. Classification
We test our model using a classification dataset.
X, y = make_classification(n_samples=500, n_features=2, n_informative=2, n_redundant=0, n_classes=2, random_state=10)
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

ax.scatter(X[:, 0], X[:, 1], c=y)
ax.grid(True, alpha=.3)
ax.set_title("Dataset", fontsize=24)
ax.set_xlabel("Feature $x_{1}$", fontsize=20)
ax.set_ylabel("Feature $x_{2}$", fontsize=20)
ax.tick_params(labelsize=20)
kf = KFold(n_splits=5)

roc_scores = []
for train_idx, test_idx in kf.split(X):
    model = KNNModel(is_classification=True, distance=euclidean)

    X_train = X[train_idx]
    y_train = y[train_idx]

    X_test = X[test_idx]
    y_test = y[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    score = roc_auc_score(y_true=y_test, y_score=y_pred)
    roc_scores.append(score)
We find that this naive implementation works pretty well on the generated dataset, with an average ROC AUC score of 0.95 across the 5 folds.
np.array(roc_scores).mean()
0.9520628722985057
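As a sanity check (not part of the original experiments), we can run scikit-learn's own KNeighborsClassifier through the same folds, reusing the kf, X and y defined above, and verify the scores are in the same ballpark:

from sklearn.neighbors import KNeighborsClassifier

sklearn_roc_scores = []
for train_idx, test_idx in kf.split(X):
    sk_model = KNeighborsClassifier(n_neighbors=3)
    sk_model.fit(X[train_idx], y[train_idx])
    # Use the positive-class probability as the score for ROC AUC
    y_score = sk_model.predict_proba(X[test_idx])[:, 1]
    sklearn_roc_scores.append(roc_auc_score(y_true=y[test_idx], y_score=y_score))

print(np.array(sklearn_roc_scores).mean())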
2.2. Regression
What about regression? We perform the same process, but using a regression dataset and the regression variant of the algorithm.
X, y = make_regression(n_samples=500, n_features=1, n_informative=1, n_targets=1, noise=10, random_state=0)
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

ax.scatter(X, y)
ax.grid(True, alpha=.3)
ax.set_title("Dataset", fontsize=24)
ax.set_xlabel("Feature $x_{1}$", fontsize=20)
ax.set_ylabel("Label", fontsize=20)
ax.tick_params(labelsize=20)
kf = KFold(n_splits=5)

r2_scores = []
for train_idx, test_idx in kf.split(X):
    model = KNNModel(is_classification=False, distance=euclidean)

    X_train = X[train_idx]
    y_train = y[train_idx]

    X_test = X[test_idx]
    y_test = y[test_idx]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    score = r2_score(y_true=y_test, y_pred=y_pred)
    r2_scores.append(score)
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

ax.scatter(X_test, y_test)
ax.scatter(X_test, y_pred)
ax.grid(True, alpha=.3)

ax.legend(["Test data", "Predictions"], fontsize=20)
ax.set_title("Dataset", fontsize=24)
ax.set_xlabel("Feature $x_{1}$", fontsize=20)
ax.set_ylabel("Label", fontsize=20)
ax.tick_params(labelsize=20)
Again, this naive model yields pretty good results on the synthetic regression dataset, with an average \(R^2\) score of 0.93.
np.array(r2_scores).mean()
0.9344901211500038
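To see how sensitive the regression results are to \(K\), a quick sweep over a few arbitrary values (just an illustrative sketch, reusing the KFold split and dataset defined above) can be run like this:

for k in [1, 3, 7, 15]:
    fold_scores = []
    for train_idx, test_idx in kf.split(X):
        model = KNNModel(is_classification=False, k_neighbours=k, distance=euclidean)
        model.fit(X[train_idx], y[train_idx])
        preds = model.predict(X[test_idx])
        fold_scores.append(r2_score(y_true=y[test_idx], y_pred=preds))
    print(f"k={k}: mean R2 = {np.mean(fold_scores):.3f}")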
3. Strengths and Weaknesses
A summary of the strengths and weaknesses of this model.
Pros
- Simple to understand and implement.
- No assumptions about data distribution.
- Effective in small, well-separated datasets.
- Adapts easily to new data (lazy learner).
Cons
- Computationally expensive during prediction, especially with large datasets. \(O(n \cdot d \cdot k)\), where \(n\) is the number of samples, \(d\) is the number of dimensions, and \(k\) is the number of neighbors.
- Sensitive to irrelevant features and noise.
- Requires feature scaling for optimal performance (see the scaling sketch after this list).
- Struggles with high-dimensional data (curse of dimensionality).
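Regarding feature scaling, here is a minimal sketch of how one might standardize features before fitting the model. The dataset, split, and variable names are made up for illustration; the key points are that a feature with a much larger scale would otherwise dominate the Euclidean distance, and that the scaler is fit on the training portion only to avoid leakage.

from sklearn.preprocessing import StandardScaler

# Illustrative data: two features on very different scales
rng = np.random.default_rng(0)
X_raw = np.column_stack([rng.normal(0, 1, 200),       # feature with unit scale
                         rng.normal(0, 1000, 200)])   # feature with ~1000x larger scale
y_raw = (X_raw[:, 0] > 0).astype(int)                 # label depends on the small-scale feature

# Fit the scaler on the training portion only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_raw[:150])
X_test_scaled = scaler.transform(X_raw[150:])

scaled_knn = KNNModel(is_classification=True, k_neighbours=3, distance=euclidean)
scaled_knn.fit(X_train_scaled, y_raw[:150])
scaled_preds = scaled_knn.predict(X_test_scaled)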