Clustering
Thu 15 April 2021
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import numpy.linalg as LA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
plt.rcParams['figure.figsize'] = [12, 8]
sns.set()
sns.set_style('ticks')
sns.set_palette("Set3")
np.set_printoptions(suppress=True)
def loaddata(filename):
    # Load the data set from a CSV file
    X = np.loadtxt(filename, delimiter=',')
    return X
X = loaddata("./data/seedsDataset.txt")
In [3]:
def normalize_data(X):
    # Compute the mean of each feature
    X_mean = np.mean(X, axis=0)
    # Compute the standard deviation of each feature
    X_std = np.std(X, axis=0)
    return (X - X_mean) / X_std
X_norm = normalize_data(X)
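As a quick sanity check (a minimal sketch using only the objects defined above), every standardized feature should now have mean approximately 0 and standard deviation 1:
In [ ]:
# Sanity check: each standardized feature should have mean ~0 and std ~1
print(np.allclose(np.mean(X_norm, axis=0), 0))  # expected: True
print(np.allclose(np.std(X_norm, axis=0), 1))   # expected: True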
In [4]:
def euclidian_dist(sample1, sample2):
    # Euclidean distance between two feature vectors
    return np.sqrt(np.sum((sample1 - sample2)**2))

def generate_random_centroids(K, X):
    # Choose K random data objects as the initial centroids
    rng = np.random.default_rng()
    idx = rng.choice(X.shape[0], K, replace=False)
    centroids = X[idx, :]
    # Label each centroid with its cluster number k (stored in column 8)
    for k in range(1, K + 1):
        centroids[k-1, 8] = k
    return centroids

def compute_intra_cluster_dist(X, centroids):
    # Sum of squared distances from each data object to its assigned centroid
    intra_dist = []
    for centroid in centroids:
        for x_n in X[X[:, 8] == centroid[8]]:
            intra_dist.append(euclidian_dist(x_n[:8], centroid[:8]) ** 2)
    return np.sum(intra_dist)
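A throwaway check of the distance helper on a known 3-4-5 right triangle (illustrative only, not part of the assignment data):
In [ ]:
# The distance between (0, 0) and (3, 4) should be exactly 5
print(euclidian_dist(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # expected: 5.0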
def K_means_clustering(K, X):
    N = X.shape[0]
    # Add a column that will hold the cluster assignment (k value) for each data object
    zero_col = np.zeros((N, 1))
    X = np.hstack((X, zero_col))
    # Generate random centroids - choose random data objects as centroids
    centroids_current = generate_random_centroids(K, X)
    assignments_start = -np.ones(N)
    assignments_current = assignments_start
    ###### step 2 #####
    # If all assignments are the same as in the previous iteration -> stop!
    while not (assignments_current == X[:, 8]).all():
        assignments_current = X[:, 8].copy()
        ###### step 1 #####
        # For each data object X_n, find the k that minimizes D(X_n, U_k),
        # i.e. the centroid closest to the object, and assign the object to that
        # centroid: z_nk = 1 if the object is assigned to cluster k
        for x_n in X:
            dist_to_centroids = []
            for centroid in centroids_current:
                dist_to_centroids.append([euclidian_dist(x_n[:8], centroid[:8]), centroid[-1]])
            dist_to_centroids = np.array(dist_to_centroids)
            x_n[8] = dist_to_centroids[np.argmin(dist_to_centroids[:, 0]), 1]
        ##### step 3 ######
        # Update every U_k with the equation: sum(z_nk * X_n) / sum(z_nk)
        new_centroids = []
        for k in range(1, K + 1):
            # If a cluster is empty its mean is NaN, so keep the old centroid
            if np.isnan(np.mean(X[X[:, 8] == k][:, :8], axis=0)).any():
                centroid_k = centroids_current[k-1]
            else:
                centroid_k = np.append(np.mean(X[X[:, 8] == k][:, :8], axis=0), k)
            new_centroids.append(centroid_k)
        centroids_current = np.array(new_centroids)
        ###### step 4 ######
        # Return to step 1
    unique, counts = np.unique(X[:, 8], return_counts=True)
    cluster_counts = dict(zip(unique, counts))
    intra_cluster_dist = compute_intra_cluster_dist(X, centroids_current)
    return X, cluster_counts, intra_cluster_dist, centroids_current
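A small smoke test (a sketch on synthetic data, not the seeds data set) can confirm the function behaves sensibly: two tight 8-feature blobs should typically be recovered as two clusters of 20 points each, with a small intra-cluster distance. Since the centroids are initialized randomly, this outcome is not guaranteed on every run.
In [ ]:
# Smoke test on two well-separated synthetic blobs (8 features, as the code assumes)
rng_demo = np.random.default_rng(0)
toy = np.vstack((rng_demo.normal(0, 0.1, (20, 8)), rng_demo.normal(5, 0.1, (20, 8))))
_, toy_counts, toy_dist, _ = K_means_clustering(2, toy)
print(toy_counts, toy_dist)  # typically two clusters of 20 and a small distance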
In [5]:
# Perform the clustering 5 times and print out the results
K = 3
clustering_models = []
for num in range(5):
    clustering_models.append(K_means_clustering(K, X_norm))

count = 1
for model_results in clustering_models:
    print(f"{count}. run:\n Cluster 1 = {model_results[1][1]}, Cluster 2 = {model_results[1][2]}, Cluster 3 = {model_results[1][3]}, intra cluster distance = {model_results[2]}\n")
    count += 1
# As all 5 runs perform equally well, I choose the first model and save its results in a variable
clustering_model = clustering_models[0]
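As an optional cross-check (a sketch assuming scikit-learn is available; it is not part of the original solution), scikit-learn's KMeans minimizes the same sum of squared intra-cluster distances, so its inertia_ should land in the same ballpark as the distances printed above:
In [ ]:
# Optional cross-check against scikit-learn's KMeans
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_norm)
print(km.inertia_)  # sum of squared distances to the closest centroid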
In [6]:
# This function and overall solution is based on parts of the handout code for assignment 4
def pca(data):
    data_norm = normalize_data(data)
    N = data_norm.shape[0]
    # Covariance matrix of the normalized data
    C = (1/N) * data_norm.T @ data_norm
    # eigh returns the eigenvalues in ascending order
    e_vals, e_vecs = LA.eigh(C)
    return e_vals, e_vecs

def transform_data(X, e_vecs, dims):
    # Project the data onto the first 'dims' principal components
    return np.dot(X, e_vecs[:, :dims])
e_vals, e_vecs = pca(X_norm)
# Sort the eigenvalues and eigenvectors in descending order
i_max = (-e_vals).argsort()
e_vals = e_vals[i_max]
e_vecs = e_vecs[:,i_max]
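As a hedged cross-check (optional, not part of the original handout code), the eigenvalues of C = (1/N) * X^T X should equal the squared singular values of the normalized data divided by N, in the same descending order:
In [ ]:
# Optional cross-check of the eigendecomposition via the SVD of the data
U, S, Vt = np.linalg.svd(X_norm, full_matrices=False)
print(np.allclose(S**2 / X_norm.shape[0], e_vals))  # expected: True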
In [7]:
# This plot and overall solution is based on parts of the handout code for assignment 4
variance_explained_per_component = e_vals/np.sum(e_vals)
cumulative_variance_explained = np.cumsum(variance_explained_per_component)
plt.plot(cumulative_variance_explained)
plt.xlabel('Number of principal components included')
plt.ylabel('Proportion of variance explained')
plt.title('Proportion of variance explained as a function of number of PCs included')
# Let's print out the proportion of variance explained by the first 7 PCs
for i in range(7):
    print('Proportion of variance explained by the first '+str(i+1)+' principal components:', cumulative_variance_explained[i])
In [8]:
# Transforming original data based on PCA
X_pca = transform_data(X_norm, e_vecs, 3)
print('Shape of the transformed data =', X_pca.shape)
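The 3 components kept here retain the share of variance computed in the previous cell (reusing cumulative_variance_explained from above):
In [ ]:
# Proportion of variance retained by the 3 components kept above
print('Variance retained by 3 PCs:', cumulative_variance_explained[2])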
In [9]:
# Performing dimension reduction on the features of the clustering model data
features = clustering_model[0][:,:8]
labels = clustering_model[0][:,8]
centroids = clustering_model[3]
labels_centroids = centroids[:,8]
features_centroids = centroids[:,:8]
e_vals, e_vecs = pca(features)
i_max = (-e_vals).argsort()
e_vals = e_vals[i_max]
e_vecs = e_vecs[:,i_max]
features_reduced = transform_data(features, e_vecs, 2)
features_centroids_reduced = transform_data(features_centroids, e_vecs, 2)
In [10]:
# Visualization function is based on handout code for assignment 5
def visualize_clusters(features, labels, centroid_features, centroid_labels):
    cmap_light = ListedColormap([sns.color_palette("Set3")[3], sns.color_palette("Set3")[4], sns.color_palette("Set3")[0]])
    scatter = plt.scatter(features[:, 0], features[:, 1], c=labels, cmap=cmap_light)
    plt.xlim(features[:, 0].min() - 0.1, features[:, 0].max() + 0.1)
    plt.ylim(features[:, 1].min() - 0.1, features[:, 1].max() + 0.1)
    plt.legend(*scatter.legend_elements(), loc="lower left", title="Clusters")
    # Plot the centroids as large black dots on top of the clusters
    plt.scatter(centroid_features[:, 0], centroid_features[:, 1], c="black", s=200)
    plt.title('Plot of the 3 clusters and their respective centroids (the black dots)')
    plt.show()
visualize_clusters(features_reduced, labels, features_centroids_reduced, labels_centroids)