Classification

Thu 15 April 2021

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap


plt.rcParams['figure.figsize'] = [12, 8]
sns.set()
sns.set_style('ticks')
sns.set_palette("Set3")
np.set_printoptions(suppress=True)


def loaddata(filename):
    # Load data set from CSV file
    Xt = np.loadtxt(filename, delimiter=',')

    # Split into data matrix and target vector
    t = Xt[:,0].reshape(-1,1)
    X = Xt[:,1:]
    
    return t, X
In [28]:
labels_train, X_train = loaddata("./data/accent-mfcc-data_shuffled_train.txt")
labels_validation, X_validation = loaddata("./data/accent-mfcc-data_shuffled_validation.txt")
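As a quick sanity check on the split, the array shapes can be printed (a minimal sketch; it assumes nothing beyond the label/feature split done in loaddata):

In [ ]:
print(f'X_train: {X_train.shape}, labels_train: {labels_train.shape}')
print(f'X_validation: {X_validation.shape}, labels_validation: {labels_validation.shape}')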
In [29]:
def __visualizeLabels(features, referenceLabels):
    # One colour per class; the palette covers the six accent classes
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#FF0000', '#00FF00', '#0000FF'])
    y = referenceLabels.ravel()  # scatter expects a 1-d colour array

    # Plot only the first two MFCC dimensions of each sample
    scatter = plt.scatter(features[:, 0], features[:, 1], c=y + 1, cmap=cmap_light)

    plt.xlim(features[:, 0].min() - 0.1, features[:, 0].max() + 0.1)
    plt.ylim(features[:, 1].min() - 0.1, features[:, 1].max() + 0.1)

    plt.legend(*scatter.legend_elements(), loc="lower left", title="Classes")

    plt.show()

    
__visualizeLabels(X_train, labels_train)
In [8]:
def __randomForests(training_features, training_labels, **kwargs):
    # Fit a single forest and expose both its hard and probabilistic predictors
    classifier = RandomForestClassifier(**kwargs)
    classifier.fit(training_features, training_labels)
    return classifier.predict, classifier.predict_proba

# Function to compute metrics of the model
def compute_metrics(labels_prob_preds, labels_validation):
    # Derive the hard predictions from the probability estimates; this assumes
    # integer class labels 0..K-1 that line up with the predict_proba columns
    labels_preds = np.argmax(labels_prob_preds, axis=1).reshape(-1, 1)

    # Compute number of correct predictions
    num_correct_preds = np.sum(labels_validation == labels_preds)

    # Compute average probability assigned to the correct class
    res = []
    for prob_array, true_label in zip(labels_prob_preds, labels_validation):
        res.append(prob_array[int(true_label[0])])
    avg_prob_correct_class = np.mean(res)

    return num_correct_preds, avg_prob_correct_class
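To make the two metrics concrete, here is a toy call with hypothetical values: two samples, three classes, true labels 0 and 2.

In [ ]:
# Two samples, three classes; the true classes are 0 and 2
toy_probs = np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.3, 0.6]])
toy_labels = np.array([[0.], [2.]])

# Both argmax predictions are correct, so this returns (2, 0.65):
# two correct predictions, average probability (0.7 + 0.6) / 2
print(compute_metrics(toy_probs, toy_labels))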
In [24]:
# Fit and evaluate an instance of the model with default parameters
labels_train = labels_train.ravel()  # sklearn expects a 1-d label array

predictor, prob_predictor = __randomForests(X_train, labels_train)
labels_preds = predictor(X_validation).reshape(-1,1)
labels_prob_preds = prob_predictor(X_validation)

num_correct_preds, avg_prob_correct_class = compute_metrics(labels_prob_preds, labels_validation)

print(f'Number of correct predictions for model = {num_correct_preds}\nAverage probability assigned to correct class = {avg_prob_correct_class}')
Number of correct predictions for model = 64
Average probability assigned to correct class = 0.5655844155844154
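The two summary numbers hide which accents are confused with which. As an extra diagnostic (not part of the pipeline above), sklearn's confusion_matrix gives a per-class breakdown:

In [ ]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes
print(confusion_matrix(labels_validation.ravel(), labels_preds.ravel()))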
In [25]:
import itertools
# Create list of dictionaries containing all possible parameter combinations
parameter_options = {
               'criterion': ['entropy', 'gini'],
               'max_depth': [2,5,7,10,15],
               'max_features': ['sqrt', 'log2'],
               }
keys = parameter_options.keys()
values = (parameter_options[key] for key in keys)
parameter_combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
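With 2 criterion values, 5 depths and 2 feature settings, the grid holds 2 × 5 × 2 = 20 combinations, the first of which doubles as the starting point below:

In [ ]:
print(len(parameter_combinations))  # 20
print(parameter_combinations[0])    # {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'sqrt'}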
In [26]:
# Loop to test different combinations of parameter values of the random forest classifier
# Note: RandomForestClassifier is stochastic, so without a fixed random_state
# the metrics vary slightly from run to run
parameter_option_start = parameter_combinations[0]

predictor, prob_predictor = __randomForests(X_train, labels_train, **parameter_option_start)

labels_preds = predictor(X_validation).reshape(-1,1)
labels_prob_preds = prob_predictor(X_validation)

metric1_start, metric2_start = compute_metrics(labels_prob_preds, labels_validation) 

parameter_option_current = parameter_option_start
metric1_current, metric2_current = metric1_start, metric2_start

print(f"Starting point model parameters:\ncriterion = {parameter_option_start['criterion']} ; max depth = {parameter_option_start['max_depth']} ; max features = {parameter_option_start['max_features']} ; average probability = {metric2_start} ; num. of correctly classified samples = {metric1_start}\n")

for parameter_option_new in parameter_combinations:
    predictor, prob_predictor = __randomForests(X_train, labels_train, **parameter_option_new)

    labels_preds = predictor(X_validation).reshape(-1,1)
    labels_prob_preds = prob_predictor(X_validation)

    metric1_new, metric2_new = compute_metrics(labels_prob_preds, labels_validation)

    if metric1_new == metric1_current:
        if metric2_new > metric2_current:
            print(f"New optimal model parameters by metric 2:\ncriterion = {parameter_option_new['criterion']} ; max depth = {parameter_option_new['max_depth']} ; max features = {parameter_option_new['max_features']} ; average probability = {metric2_new} ; num. of correctly classified samples = {metric1_new}\n")
            metric1_current, metric2_current = metric1_new, metric2_new
            parameter_option_current = parameter_option_new
    elif metric1_new > metric1_current:
        print(f"New optimal model parameters by metric 1:\ncriterion = {parameter_option_new['criterion']} ; max depth = {parameter_option_new['max_depth']} ; max features = {parameter_option_new['max_features']} ; average probability = {metric2_new} ; num. of correctly classified samples = {metric1_new}\n")
        metric1_current, metric2_current = metric1_new, metric2_new
        parameter_option_current = parameter_option_new
Starting point model parameters:
criterion = entropy ; max depth = 2 ; max features = sqrt ; average probability = 0.37216387852727034 ; num. of correctly classified samples = 43

New optimal model parameters by metric 2:
criterion = entropy ; max depth = 2 ; max features = sqrt ; average probability = 0.37329409641759564 ; num. of correctly classified samples = 43

New optimal model parameters by metric 1:
criterion = entropy ; max depth = 5 ; max features = sqrt ; average probability = 0.5101209397946193 ; num. of correctly classified samples = 56

New optimal model parameters by metric 1:
criterion = entropy ; max depth = 5 ; max features = log2 ; average probability = 0.513440442748445 ; num. of correctly classified samples = 58

New optimal model parameters by metric 2:
criterion = entropy ; max depth = 7 ; max features = sqrt ; average probability = 0.5466936800211694 ; num. of correctly classified samples = 58

New optimal model parameters by metric 1:
criterion = entropy ; max depth = 7 ; max features = log2 ; average probability = 0.5619980412898191 ; num. of correctly classified samples = 60

New optimal model parameters by metric 1:
criterion = entropy ; max depth = 10 ; max features = sqrt ; average probability = 0.5664492948270642 ; num. of correctly classified samples = 61

New optimal model parameters by metric 2:
criterion = entropy ; max depth = 15 ; max features = sqrt ; average probability = 0.5754545454545454 ; num. of correctly classified samples = 61

New optimal model parameters by metric 1:
criterion = entropy ; max depth = 15 ; max features = log2 ; average probability = 0.5742857142857143 ; num. of correctly classified samples = 63
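
As an aside, scikit-learn ships the same enumeration logic as sklearn.model_selection.ParameterGrid, which produces an equivalent list of parameter dictionaries and could replace the itertools construction above:

In [ ]:
from sklearn.model_selection import ParameterGrid

grid = list(ParameterGrid(parameter_options))
print(len(grid))   # 20, the same combinations as parameter_combinations
print(grid[0])

For a search with cross-validation rather than a single fixed validation split, GridSearchCV would be the standard tool.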