Classification
Thu 15 April 2021
In [27]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
# Notebook-wide display configuration.
# Default figure size handed to matplotlib for every figure created below.
plt.rcParams['figure.figsize'] = [12, 8]
# Seaborn styling. The three calls run in this order, with set_style and
# set_palette applied after the general sns.set() call - keep the order when
# editing, since later calls are presumably meant to override earlier defaults.
sns.set()
sns.set_style('ticks')
sns.set_palette("Set3")
# suppress=True asks NumPy to print floats without scientific notation.
np.set_printoptions(suppress=True)
def loaddata(filename):
    """Load a comma-separated numeric file and split it into targets and features.

    Parameters
    ----------
    filename : str or path-like
        File readable by ``np.loadtxt`` with ``','`` as delimiter. The first
        column is taken as the target, every remaining column as a feature.

    Returns
    -------
    t : np.ndarray, shape (n_samples, 1)
        Target values as a column vector (first column of the file).
    X : np.ndarray, shape (n_samples, n_columns - 1)
        Data matrix (all columns after the first).
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a single
    # row; without it loadtxt returns a 1-D array and the column slicing below
    # raises IndexError.
    Xt = np.loadtxt(filename, delimiter=',', ndmin=2)
    # Split into target vector and data matrix
    t = Xt[:, 0].reshape(-1, 1)
    X = Xt[:, 1:]
    return t, X
In [28]:
# Read both splits; loaddata returns (target column vector, data matrix).
labels_train, X_train = loaddata(
    "./data/accent-mfcc-data_shuffled_train.txt"
)
labels_validation, X_validation = loaddata(
    "./data/accent-mfcc-data_shuffled_validation.txt"
)
In [29]:
def __visualizeLabels(features, referenceLabels):
    """Scatter-plot the first two feature columns, coloured by class label.

    Parameters
    ----------
    features : np.ndarray with at least two columns
        Only columns 0 and 1 are drawn; any further columns are ignored.
    referenceLabels : np.ndarray
        One numeric label per row of ``features``; used as the colour value
        of each point and as the source of the legend entries.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#FF0000', '#00FF00', '#0000FF'])
    x_values = features[:, 0]
    y_values = features[:, 1]
    # NOTE(review): the +1 offset is kept from the original code; it presumably
    # also shifts the values shown in the legend - confirm this is intended.
    scatter = plt.scatter(x_values, y_values, c=referenceLabels + 1, cmap=cmap_light)
    # Pad the axis limits slightly so extreme points are not drawn on the frame.
    plt.xlim(x_values.min() - 0.1, x_values.max() + 0.1)
    plt.ylim(y_values.min() - 0.1, y_values.max() + 0.1)
    # Label the axes so the figure can be read on its own.
    plt.xlabel("Feature 0")
    plt.ylabel("Feature 1")
    plt.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
    plt.show()
# Plot the training samples (first two feature columns) coloured by label.
__visualizeLabels(features=X_train, referenceLabels=labels_train)
In [8]:
def __randomForests(training_features, training_labels, **kwargs):
    """Train a random forest and return its two prediction callables.

    Parameters
    ----------
    training_features : array-like
        Feature matrix passed to ``RandomForestClassifier.fit``.
    training_labels : array-like
        Target labels passed to ``RandomForestClassifier.fit``.
    **kwargs
        Forwarded unchanged to the ``RandomForestClassifier`` constructor
        (e.g. ``criterion``, ``max_depth``, ``max_features``).

    Returns
    -------
    predictor : callable
        Bound ``predict`` method of the fitted classifier.
    prob_predictor : callable
        Bound ``predict_proba`` method of the same fitted classifier.
    """
    classifier = RandomForestClassifier(**kwargs)
    # Fit exactly once. The previous version called fit() twice on the same
    # estimator, which retrained the forest from scratch and doubled the
    # training time; both returned methods belong to this one fitted object.
    classifier.fit(training_features, training_labels)
    return classifier.predict, classifier.predict_proba
# Function to compute metrics of the model
def compute_metrics(labels_prob_preds, labels_validation, labels_preds=None):
    """Score class-probability predictions against the reference labels.

    Parameters
    ----------
    labels_prob_preds : array-like, shape (n_samples, n_classes)
        Predicted probability of every class for each sample. Column ``k`` is
        read as the probability of class label ``k``: the reference labels are
        cast to ``int`` and used directly as column indices.
    labels_validation : array-like, shape (n_samples,) or (n_samples, 1)
        Reference class labels.
    labels_preds : array-like, optional
        Hard class predictions, one per sample, in any shape that flattens to
        ``n_samples`` values. When omitted, the most probable class of each
        row (``argmax`` over the columns) is used, so the function does not
        depend on a ``labels_preds`` variable existing outside of it. Under
        the label-equals-column-index convention above this is presumably the
        same value a classifier's ``predict`` would give - verify if the
        labels are not 0..n_classes-1.

    Returns
    -------
    num_correct_preds : NumPy integer
        Number of samples whose predicted label equals the reference label.
    avg_prob_correct_class : NumPy float
        Mean probability that was assigned to the reference class.
    """
    probabilities = np.asarray(labels_prob_preds)
    true_labels = np.asarray(labels_validation).ravel()

    if labels_preds is None:
        predicted_labels = np.argmax(probabilities, axis=1)
    else:
        predicted_labels = np.asarray(labels_preds).ravel()

    # Compute number of correct predictions. Both sides are flattened first:
    # comparing an (n,) array with an (n, 1) array would broadcast to (n, n)
    # and silently inflate the count.
    num_correct_preds = np.sum(true_labels == predicted_labels)

    # Compute average probability assigned to the correct class: pick, for
    # every row, the column whose index is that row's reference label.
    true_columns = true_labels.astype(int)
    row_index = np.arange(true_columns.size)
    avg_prob_correct_class = np.mean(probabilities[row_index, true_columns])

    return num_correct_preds, avg_prob_correct_class
In [24]:
# Baseline: fit the random forest without passing any hyper-parameters and
# score it on the validation split.
labels_train = np.ravel(labels_train)  # flatten the target column to 1-D for fit()
predictor, prob_predictor = __randomForests(X_train, labels_train)
labels_prob_preds = prob_predictor(X_validation)
labels_preds = predictor(X_validation).reshape(-1, 1)
num_correct_preds, avg_prob_correct_class = compute_metrics(
    labels_prob_preds,
    labels_validation,
)
print(
    f'Number of correct predicitons for model = {num_correct_preds}\nAverage probability assigned to correct class = {avg_prob_correct_class}'
)
In [25]:
import itertools

# Hyper-parameter search space: each key maps to the candidate values to try.
parameter_options = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 5, 7, 10, 15],
    'max_features': ['sqrt', 'log2'],
}

# Expand the search space into one dict per point of the Cartesian product,
# e.g. {'criterion': 'entropy', 'max_depth': 2, 'max_features': 'sqrt'}.
keys = parameter_options.keys()
values = map(parameter_options.get, keys)  # lazy; consumed by product() below
parameter_combinations = []
for combination in itertools.product(*values):
    parameter_combinations.append(dict(zip(keys, combination)))
In [26]:
# Loop to test different combinations of parameter values of the random forest classifier.
# Selection rule: a candidate replaces the current best when it classifies more
# validation samples correctly (metric 1); on an exact tie it wins only with a
# higher average probability assigned to the correct class (metric 2).

# Starting point: the first parameter combination.
parameter_option_start = parameter_combinations[0]
predictor, prob_predictor = __randomForests(X_train, labels_train, **parameter_option_start)
# Keep labels_preds bound at notebook level: code outside this cell may look it up by name.
labels_preds = predictor(X_validation).reshape(-1, 1)
labels_prob_preds = prob_predictor(X_validation)
metric1_start, metric2_start = compute_metrics(labels_prob_preds, labels_validation)

parameter_option_current = parameter_option_start
metric1_current, metric2_current = metric1_start, metric2_start
print(
    f"Starting point model parameters:\n"
    f"criterion = {parameter_option_start['criterion']} ; "
    f"max depth = {parameter_option_start['max_depth']} ; "
    f"max features = {parameter_option_start['max_features']} ; "
    f"average probability = {metric2_start} ; "
    f"num. of correctly classified samples = {metric1_start}\n"
)

# The first combination is already the starting point, so skip it here.
# Retraining it would cost an extra fit and, because the forest is not seeded,
# could report the very same parameters as a "new optimum".
for parameter_option_new in parameter_combinations[1:]:
    predictor, prob_predictor = __randomForests(X_train, labels_train, **parameter_option_new)
    labels_preds = predictor(X_validation).reshape(-1, 1)
    labels_prob_preds = prob_predictor(X_validation)
    metric1_new, metric2_new = compute_metrics(labels_prob_preds, labels_validation)

    # Decide which metric, if any, makes this candidate the new best.
    if metric1_new > metric1_current:
        improved_metric = 1
    elif metric1_new == metric1_current and metric2_new > metric2_current:
        improved_metric = 2
    else:
        improved_metric = None

    if improved_metric is not None:
        print(
            f"New optimal model parameters by metric {improved_metric}:\n"
            f"criterion = {parameter_option_new['criterion']} ; "
            f"max depth = {parameter_option_new['max_depth']} ; "
            f"max features = {parameter_option_new['max_features']} ; "
            f"average probability = {metric2_new} ; "
            f"num. of correctly classified samples = {metric1_new}\n"
        )
        metric1_current, metric2_current = metric1_new, metric2_new
        parameter_option_current = parameter_option_new