Clustering
Thu 15 April 2021
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import numpy.linalg as LA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
plt.rcParams['figure.figsize'] = [12, 8]
sns.set()
sns.set_style('ticks')
sns.set_palette("Set3")
np.set_printoptions(suppress=True)
def loaddata(filename):
    # Load the data set from a CSV file
    X = np.loadtxt(filename, delimiter=',')
    return X
X = loaddata("./data/seedsDataset.txt")
In [3]:
def normalize_data(X):
    # Compute the mean of each feature
    X_mean = np.mean(X, axis=0)
    # Compute the standard deviation of each feature
    X_std = np.std(X, axis=0)
    return (X - X_mean) / X_std
X_norm = normalize_data(X)
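As a quick sanity check (a minimal sketch using only the objects defined above), every standardized feature should now have mean approximately 0 and standard deviation 1:
In [ ]:
# Sanity check: each standardized feature should have mean ~0 and std ~1
print(np.allclose(np.mean(X_norm, axis=0), 0))  # expected: True
print(np.allclose(np.std(X_norm, axis=0), 1))   # expected: True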
In [4]:
def euclidian_dist(sample1, sample2):
    # Euclidean distance between two feature vectors
    return np.sqrt(np.sum((sample1 - sample2)**2))

def generate_random_centroids(K, X):
    # Choose K random data objects as the initial centroids
    rng = np.random.default_rng()
    idx = rng.choice(X.shape[0], K, replace=False)
    centroids = X[idx, :]
    # Label each centroid with its cluster number k (stored in column 8)
    for k in range(1, K + 1):
        centroids[k-1, 8] = k
    return centroids

def compute_intra_cluster_dist(X, centroids):
    # Sum of squared distances from each data object to its assigned centroid
    intra_dist = []
    for centroid in centroids:
        for x_n in X[X[:, 8] == centroid[8]]:
            intra_dist.append(euclidian_dist(x_n[:8], centroid[:8]) ** 2)
    return np.sum(intra_dist)
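A throwaway check of the distance helper on a known 3-4-5 right triangle (illustrative only, not part of the assignment data):
In [ ]:
# The distance between (0, 0) and (3, 4) should be exactly 5
print(euclidian_dist(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # expected: 5.0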
def K_means_clustering(K, X):
    N = X.shape[0]
    # Add a column that will hold the cluster assignment (k value) for each data object
    zero_col = np.zeros((N, 1))
    X = np.hstack((X, zero_col))
    # Generate random centroids - choose random data objects as centroids
    centroids_current = generate_random_centroids(K, X)
    assignments_start = -np.ones(N)
    assignments_current = assignments_start
    ###### step 2 #####
    # If all assignments are the same as in the previous iteration -> stop!
    while not (assignments_current == X[:, 8]).all():
        assignments_current = X[:, 8].copy()
        ###### step 1 #####
        # For each data object X_n, find the k that minimizes D(X_n, U_k),
        # i.e. the centroid closest to the object, and assign the object to that
        # centroid: z_nk = 1 if the object is assigned to cluster k
        for x_n in X:
            dist_to_centroids = []
            for centroid in centroids_current:
                dist_to_centroids.append([euclidian_dist(x_n[:8], centroid[:8]), centroid[-1]])
            dist_to_centroids = np.array(dist_to_centroids)
            x_n[8] = dist_to_centroids[np.argmin(dist_to_centroids[:, 0]), 1]
        ##### step 3 ######
        # Update every U_k with the equation: sum(z_nk * X_n) / sum(z_nk)
        new_centroids = []
        for k in range(1, K + 1):
            # If a cluster is empty its mean is NaN, so keep the old centroid
            if np.isnan(np.mean(X[X[:, 8] == k][:, :8], axis=0)).any():
                centroid_k = centroids_current[k-1]
            else:
                centroid_k = np.append(np.mean(X[X[:, 8] == k][:, :8], axis=0), k)
            new_centroids.append(centroid_k)
        centroids_current = np.array(new_centroids)
        ###### step 4 ######
        # Return to step 1
    unique, counts = np.unique(X[:, 8], return_counts=True)
    cluster_counts = dict(zip(unique, counts))
    intra_cluster_dist = compute_intra_cluster_dist(X, centroids_current)
    return X, cluster_counts, intra_cluster_dist, centroids_current
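A small smoke test (a sketch on synthetic data, not the seeds data set) can confirm the function behaves sensibly: two tight 8-feature blobs should typically be recovered as two clusters of 20 points each, with a small intra-cluster distance. Since the centroids are initialized randomly, this outcome is not guaranteed on every run.
In [ ]:
# Smoke test on two well-separated synthetic blobs (8 features, as the code assumes)
rng_demo = np.random.default_rng(0)
toy = np.vstack((rng_demo.normal(0, 0.1, (20, 8)), rng_demo.normal(5, 0.1, (20, 8))))
_, toy_counts, toy_dist, _ = K_means_clustering(2, toy)
print(toy_counts, toy_dist)  # typically two clusters of 20 and a small distance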
In [5]:
# Perform the clustering 5 times and print out the results
K = 3
clustering_models = []
for num in range(5):
    clustering_models.append(K_means_clustering(K, X_norm))

count = 1
for model_results in clustering_models:
    print(f"{count}. run:\n Cluster 1 = {model_results[1][1]}, Cluster 2 = {model_results[1][2]}, Cluster 3 = {model_results[1][3]}, intra cluster distance = {model_results[2]}\n")
    count += 1
# As all 5 runs perform equally well, I choose the first model and save its results in a variable
clustering_model = clustering_models[0]
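As an optional cross-check (a sketch assuming scikit-learn is available; it is not part of the original solution), scikit-learn's KMeans minimizes the same sum of squared intra-cluster distances, so its inertia_ should land in the same ballpark as the distances printed above:
In [ ]:
# Optional cross-check against scikit-learn's KMeans
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_norm)
print(km.inertia_)  # sum of squared distances to the closest centroid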
In [6]:
# This function and overall solution is based on parts of the handout code for assignment 4
def pca(data):
    data_norm = normalize_data(data)
    N = data_norm.shape[0]
    # Covariance matrix of the normalized data
    C = (1/N) * data_norm.T @ data_norm
    # eigh returns the eigenvalues in ascending order
    e_vals, e_vecs = LA.eigh(C)
    return e_vals, e_vecs

def transform_data(X, e_vecs, dims):
    # Project the data onto the first 'dims' principal components
    return np.dot(X, e_vecs[:, :dims])
e_vals, e_vecs = pca(X_norm)
# Sort the eigenvalues and eigenvectors in descending order
i_max = (-e_vals).argsort()
e_vals = e_vals[i_max]
e_vecs = e_vecs[:,i_max]
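As a hedged cross-check (optional, not part of the original handout code), the eigenvalues of C = (1/N) * X^T X should equal the squared singular values of the normalized data divided by N, in the same descending order:
In [ ]:
# Optional cross-check of the eigendecomposition via the SVD of the data
U, S, Vt = np.linalg.svd(X_norm, full_matrices=False)
print(np.allclose(S**2 / X_norm.shape[0], e_vals))  # expected: True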
In [7]:
# This plot and overall solution is based on parts of the handout code for assignment 4
variance_explained_per_component = e_vals/np.sum(e_vals)
cumulative_variance_explained = np.cumsum(variance_explained_per_component)
plt.plot(cumulative_variance_explained)
plt.xlabel('Number of principal components included')
plt.ylabel('Proportion of variance explained')
plt.title('Proportion of variance explained as a function of number of PCs included')
# Let's print out the proportion of variance explained by the first 7 PCs
for i in range(7):
    print('Proportion of variance explained by the first '+str(i+1)+' principal components:', cumulative_variance_explained[i])
In [8]:
# Transforming original data based on PCA
X_pca = transform_data(X_norm, e_vecs, 3)
print('Shape of the transformed data =', X_pca.shape)
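The 3 components kept here retain the share of variance computed in the previous cell (reusing cumulative_variance_explained from above):
In [ ]:
# Proportion of variance retained by the 3 components kept above
print('Variance retained by 3 PCs:', cumulative_variance_explained[2])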
In [9]:
# Performing dimension reduction on the features of the clustering model data
features = clustering_model[0][:,:8]
labels = clustering_model[0][:,8]
centroids = clustering_model[3]
labels_centroids = centroids[:,8]
features_centroids = centroids[:,:8]
e_vals, e_vecs = pca(features)
i_max = (-e_vals).argsort()
e_vals = e_vals[i_max]
e_vecs = e_vecs[:,i_max]
features_reduced = transform_data(features, e_vecs, 2)
features_centroids_reduced = transform_data(features_centroids, e_vecs, 2)
In [10]:
# Visualization function is based on handout code for assignment 5
def visualize_clusters(features, labels, centroid_features, centroid_labels):
    cmap_light = ListedColormap([sns.color_palette("Set3")[3], sns.color_palette("Set3")[4], sns.color_palette("Set3")[0]])
    scatter = plt.scatter(features[:, 0], features[:, 1], c=labels, cmap=cmap_light)
    plt.xlim(features[:, 0].min() - 0.1, features[:, 0].max() + 0.1)
    plt.ylim(features[:, 1].min() - 0.1, features[:, 1].max() + 0.1)
    plt.legend(*scatter.legend_elements(), loc="lower left", title="Clusters")
    # Plot the centroids as large black dots on top of the clusters
    plt.scatter(centroid_features[:, 0], centroid_features[:, 1], c="black", s=200)
    plt.title('Plot of the 3 clusters and their respective centroids (the black dots)')
    plt.show()
visualize_clusters(features_reduced, labels, features_centroids_reduced, labels_centroids)