https://archive.ics.uci.edu/ml/datasets/Glass+Identification
Glass is a fun substance and it is everywhere around us. It is interesting to learn a little more about the composition of such a common material and how different physical properties lend themselves to different use cases. The dataset has 9 numerical attributes and a 10th categorical attribute that represents the type of glass. There are no missing values in this data set.
There are six different types of glass in this dataset, so I would expect there to be six clusters. If the clusters are distinct, it would mean the types of glass have distinct characteristics. If the clusters are not distinct, the types of glass are quite similar, so the type of glass may matter less than its application or use. I am not sure whether the glass characteristics are distinct; there may be a lot of variance within each type. I expect between three and six clusters. The clusters should not be the same size: glass types five and six have fewer instances than types one, two, or seven.
import random
import numpy as np
from collections import deque
def kmeans(data: np.ndarray, n_clusters: int, epsilon: float):
    """
    5. A function that implements the k-means clustering algorithm.
    The function should take a data matrix, a number of clusters k, and
    a convergence parameter epsilon, as input, and return the representatives
    (means) as well as the clusters found using k-means.
    If the distance is the same between a point and more than one representative
    (mean), then assign the point to the mean corresponding to the cluster with
    the lowest index.
    """
    assert n_clusters > 0
    num_attr = len(data[0])
    centroids = np.zeros([n_clusters, num_attr])
    change = epsilon + 1  # force at least one iteration
    labels = np.zeros([len(data)])
    sums_list = np.zeros([n_clusters])
    # Initialize each centroid coordinate uniformly at random within the
    # observed range of the corresponding attribute.
    for i in range(num_attr):
        for j in range(n_clusters):
            centroids[j][i] = random.random() * (
                np.max(data[:, i]) - np.min(data[:, i])
            ) + np.min(data[:, i])
    while change > epsilon:
        last_centroids = np.ndarray.copy(centroids)
        # Assignment step: attach each point to its nearest centroid.
        # Strict "<" breaks ties in favor of the lowest cluster index.
        for index, instance in enumerate(data):
            closest_index = 0
            distance = np.inf
            for label_index, centroid in enumerate(centroids):
                if (new_dist := np.linalg.norm(instance - centroid)) < distance:
                    closest_index = label_index
                    distance = new_dist
            labels[index] = closest_index
        # Update step: recompute each centroid as the mean of its points.
        centroids.fill(0)
        sums_list.fill(0)
        for index, label in enumerate(labels):
            for centroid_index, value in enumerate(data[index]):
                centroids[int(label), centroid_index] += value
            sums_list[int(label)] += 1
        for index, sum_value in enumerate(sums_list):
            for index2, value in enumerate(centroids[index]):
                if value != 0:  # guards 0/0 when a cluster ends up empty
                    centroids[index][index2] = value / sum_value
        # Converged once the total centroid movement drops below epsilon.
        change = sum(
            np.linalg.norm(centroids[x] - last_centroids[x])
            for x in range(n_clusters)
        )
    return centroids, labels
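As a quick sanity check, the function can be run on a small synthetic dataset. This is a minimal sketch with illustrative names (toy_data, toy_centroids), not the glass data; because the centroids start at random positions, an unlucky run can leave a cluster empty.
rng = np.random.default_rng(0)
toy_data = np.vstack([
    rng.normal(loc=0.0, scale=0.5, size=(20, 2)),  # blob near (0, 0)
    rng.normal(loc=5.0, scale=0.5, size=(20, 2)),  # blob near (5, 5)
])
toy_centroids, toy_labels = kmeans(toy_data, n_clusters=2, epsilon=1e-4)
print(toy_centroids)  # on a good run, one row near (0, 0) and one near (5, 5)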
def dbscan(data: np.ndarray, minpts: int, epsilon: float):
    """
    6. A function that implements the DBSCAN clustering algorithm.
    The function should take a data matrix and the parameters minpts and
    epsilon as input, and return the clusters found using DBSCAN,
    where each data point is labeled as either a noise point, a border point,
    or a core point.
    """
    core_points = set()
    border_points = {}
    noise_points = set()
    cluster_assignments = np.zeros([len(data)])
    i_deque = deque()
    group_index = 1
    close_enough_dict = {}
    # Start with every point presumed noise and an empty neighbor list.
    for i in range(len(data)):
        noise_points.add(i)
        close_enough_dict[i] = []
    # Precompute each point's epsilon-neighborhood (the point itself is
    # not included in its own neighbor list).
    for i in range(len(data) - 1):
        for j in range(i + 1, len(data)):
            if np.linalg.norm(data[i] - data[j]) <= epsilon:
                close_enough_dict[i] += [j]
                close_enough_dict[j] += [i]
    for index in range(len(data)):
        # Only an unvisited core point can seed a new cluster.
        if index in core_points or len(close_enough_dict[index]) < minpts:
            continue
        if index in noise_points:
            noise_points.remove(index)
        i_deque.extend(close_enough_dict[index])
        core_points.add(index)
        cluster_assignments[index] = group_index
        # Breadth-first expansion of the cluster from the seed core point.
        while len(i_deque) > 0:
            new_index = i_deque.popleft()
            if new_index in noise_points:
                noise_points.remove(new_index)
            close_list = close_enough_dict[new_index]
            if len(close_list) >= minpts:
                # Core point: absorb it and enqueue its neighbors.
                for i in close_list:
                    if i not in i_deque and i not in core_points:
                        i_deque.append(i)
                core_points.add(new_index)
                cluster_assignments[new_index] = group_index
            else:
                # Border point: keep the assignment from the densest
                # neighborhood seen so far.
                if new_index in border_points.keys():
                    if len(close_list) > border_points[new_index]:
                        border_points[new_index] = len(close_list)
                        cluster_assignments[new_index] = group_index
                else:
                    border_points[new_index] = len(close_list)
                    cluster_assignments[new_index] = group_index
        group_index += 1
    # Anything never reached keeps cluster id 0, i.e. noise.
    for i in noise_points:
        cluster_assignments[i] = 0
    border_points_list = border_points.keys()
    return (
        sorted(list(core_points)),
        sorted(border_points_list),
        sorted(list(noise_points)),
        cluster_assignments,
    )
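A similarly hedged sketch of calling dbscan on the same kind of two-blob toy data (names are illustrative and the parameter values were not tuned):
rng = np.random.default_rng(1)
toy_data = np.vstack([
    rng.normal(loc=0.0, scale=0.3, size=(15, 2)),
    rng.normal(loc=4.0, scale=0.3, size=(15, 2)),
])
cores, borders, noise, assignments = dbscan(toy_data, minpts=3, epsilon=0.8)
print(len(cores), len(borders), len(noise))  # most points should come back core
print(set(assignments))                      # cluster ids, with 0 marking noise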
def precision(true_lab: list, clust_lab: list):
    """
    7. (EC) A function that computes the precision of a clustering.
    The function should take a list of true cluster labels and a list of the
    cluster labels returned by some clustering algorithm, and return the
    precision of the clustering.
    """
    assert len(true_lab) == len(clust_lab)
    # Convert to arrays so plain Python lists are accepted, then shift both
    # label sets to start at zero (DBSCAN's -1 noise label becomes 0).
    true_lab = np.asarray(true_lab, dtype=int)
    clust_lab = np.asarray(clust_lab, dtype=int)
    true_lab = true_lab - np.min(true_lab)
    clust_lab = clust_lab - np.min(clust_lab)
    # Contingency table: rows are found clusters, columns are true labels.
    cont_table = np.zeros([np.max(clust_lab) + 1, np.max(true_lab) + 1])
    for index in range(len(true_lab)):
        cont_table[clust_lab[index]][true_lab[index]] += 1
    # Precision of a cluster: fraction of its points carrying the majority label.
    prec_array = []
    for index in range(len(cont_table)):
        prec_array += [np.max(cont_table[index]) / np.sum(cont_table[index])]
    return prec_array, sum(prec_array) / len(prec_array)
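As a tiny worked example of the definition (hypothetical labels, not glass data): a cluster holding two points of true class 0 and one point of true class 1 is 2/3 pure, so:
per_cluster, avg = precision([0, 0, 1, 1], [1, 1, 1, 2])
print(per_cluster)  # [0.666..., 1.0]: the first cluster is 2/3 pure
print(avg)          # 0.833..., the mean of the per-cluster precisions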
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
full_column_names = [
    "id", "refractive index", "sodium", "magnesium", "aluminum",
    "silicon", "potassium", "calcium", "barium", "iron", "type",
]
# 1. Id number: 1 to 214
# 2. RI: refractive index
# 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
# 4. Mg: Magnesium
# 5. Al: Aluminum
# 6. Si: Silicon
# 7. K: Potassium
# 8. Ca: Calcium
# 9. Ba: Barium
# 10. Fe: Iron
# 11. Type of glass
glass_df = pd.read_csv("glass.data", names=full_column_names)
glass_df.drop(columns=["id"], inplace=True)
glass_type_array = glass_df["type"]
# Type 4 has no instances in this dataset, so shift types 5-7 down to 4-6
# to make the labels contiguous.
glass_type_array = [x - 1 if x > 4 else x for x in glass_type_array.to_numpy()]
glass_df.drop(columns=["type"], inplace=True)
column_names = full_column_names[1:-1]
# glass_df.head
# glass_type_array
# project the glass data onto its first two principal components and plot them
pca = PCA(n_components=2)
pca_2_glass = pca.fit_transform(glass_df.to_numpy())
plt.scatter(pca_2_glass[:,0], pca_2_glass[:,1])
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.show()
It looks like there are one or two definite clusters, though there could be a couple of smaller ones along the line.
# transform glass data using PCA and plot fraction of total variance preserved
pca = PCA()
pca_full_glass = pca.fit_transform(glass_df.to_numpy())
plt.plot(range(1,pca_full_glass.shape[1] + 1), np.cumsum(pca.explained_variance_ratio_), marker='x')
plt.title('Glass data: fraction of total variance preserved by principal components')
plt.xlabel('r : the number of principal components')
plt.ylabel('f(r) : fraction of total variance preserved')
plt.show()
print(f"4 components seems like a good number to use")
print(f"The ratio of preserved variance is {np.cumsum(pca.explained_variance_ratio_)[3]}")
pca_glass_reduced = pca_full_glass[:,0:4]
4 components seems like a good number to use
The ratio of preserved variance is 0.9492230765479248
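Choosing r by eye works here, but it could also be automated; the sketch below assumes the fitted pca object from the cell above and an arbitrary 95% variance target:
cum_var = np.cumsum(pca.explained_variance_ratio_)
r = int(np.argmax(cum_var >= 0.95)) + 1  # first r whose cumulative ratio reaches 0.95
print(r, cum_var[r - 1])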
# original
kmeans_values_to_try = [2,3,4,5,6,7,8,9,10]
inertias_org = np.zeros(len(kmeans_values_to_try))
kmeans_org = []
for i, k in enumerate(kmeans_values_to_try):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit_predict(glass_df.to_numpy())
    inertias_org[i] = kmeans.inertia_
    kmeans_org += [kmeans]
plt.plot(kmeans_values_to_try, inertias_org, c='b', marker='.')
plt.show()
print(" For the original data:")
for index, value in enumerate(kmeans_values_to_try):
    print(f"With a k of {value}, the value of the objective function is {inertias_org[index]}")
For the original data:
With a k of 2, the value of the objective function is 819.6292544515811
With a k of 3, the value of the objective function is 589.0314496288756
With a k of 4, the value of the objective function is 494.68709047103204
With a k of 5, the value of the objective function is 402.97441326805455
With a k of 6, the value of the objective function is 336.26864985796294
With a k of 7, the value of the objective function is 292.3536803291611
With a k of 8, the value of the objective function is 268.13355983725194
With a k of 9, the value of the objective function is 245.3509223188667
With a k of 10, the value of the objective function is 230.54825087389395
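Reading the elbow off the plot is subjective; one rough heuristic, sketched below on the inertias just computed, is to take the k where the curve bends hardest, i.e. the largest second difference:
second_diff = np.diff(inertias_org, n=2)  # discrete curvature of the inertia curve
elbow_k = kmeans_values_to_try[int(np.argmax(second_diff)) + 1]  # +1 recenters the stencil
print(elbow_k)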
inertias_reduced = np.zeros(len(kmeans_values_to_try))
kmeans_red = []
for i, k in enumerate(kmeans_values_to_try):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit_predict(pca_glass_reduced)
    inertias_reduced[i] = kmeans.inertia_
    kmeans_red += [kmeans]
plt.plot(kmeans_values_to_try, inertias_reduced, c='b', marker='.')
plt.show()
print(" For the reduced dimensionality data:")
for index, value in enumerate(kmeans_values_to_try):
    print(f"With a k of {value}, the value of the objective function is {inertias_reduced[index]}")
For the reduced dimensionality data:
With a k of 2, the value of the objective function is 751.4793704706781
With a k of 3, the value of the objective function is 521.1086597635691
With a k of 4, the value of the objective function is 425.0868049044277
With a k of 5, the value of the objective function is 334.8954027742835
With a k of 6, the value of the objective function is 276.29927714575257
With a k of 7, the value of the objective function is 242.09180768950813
With a k of 8, the value of the objective function is 218.2884380170536
With a k of 9, the value of the objective function is 198.4323988349524
With a k of 10, the value of the objective function is 183.81037363742666
# original minpts
minpts_vals_to_try = [1,2,3,4,5,6,7,8,9]
dbses_org_min = []
for index, val in enumerate(minpts_vals_to_try):
    dbs = DBSCAN(eps=0.6, min_samples=val)
    dbs.fit(glass_df.to_numpy())
    dbses_org_min += [dbs]
fig = plt.figure("minpts original", figsize=(15,15))
plt.suptitle("Original Minpts")
for index, val in enumerate(dbses_org_min):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(glass_df.to_numpy()[:,0], glass_df.to_numpy()[:,1], c=val.labels_)
    # Subtract the -1 noise label (if present) so it is not counted as a cluster.
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('Attr1')
    plt.ylabel('Attr2')
plt.show()
# original eps
eps_vals_to_try = [0.4,0.5,0.6,0.7,0.8,0.9,1,1.2,1.4]
dbses_org_eps = []
for index, val in enumerate(eps_vals_to_try):
    dbs = DBSCAN(eps=val, min_samples=4)
    dbs.fit(glass_df.to_numpy())
    dbses_org_eps += [dbs]
fig = plt.figure("eps original", figsize=(15,15))
plt.suptitle("Original Eps")
for index, val in enumerate(dbses_org_eps):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(glass_df.to_numpy()[:,0], glass_df.to_numpy()[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('Attr1')
    plt.ylabel('Attr2')
plt.show()
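The eps values above were picked by trial and error; a common alternative, sketched here with sklearn's NearestNeighbors for minpts = 4, is the k-distance plot: sort every point's distance to its 4th-nearest neighbor and look for the knee.
from sklearn.neighbors import NearestNeighbors
# n_neighbors is minpts + 1 because each point comes back as its own
# nearest neighbor at distance zero.
nn = NearestNeighbors(n_neighbors=5).fit(glass_df.to_numpy())
dists, _ = nn.kneighbors(glass_df.to_numpy())
plt.plot(np.sort(dists[:, -1]))
plt.xlabel("points sorted by distance to 4th-nearest neighbor")
plt.ylabel("4-NN distance")
plt.show()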
# Reduced Dimensionality minpts
dbses_pca_min = []
for index, val in enumerate(minpts_vals_to_try):
    dbs = DBSCAN(eps=0.8, min_samples=val)
    dbs.fit(pca_glass_reduced)
    dbses_pca_min += [dbs]
fig = plt.figure("minpts reduced", figsize=(15,15))
plt.suptitle("Reduced Dimensionality Minpts")
for index, val in enumerate(dbses_pca_min):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(pca_glass_reduced[:,0], pca_glass_reduced[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('PC1')
    plt.ylabel('PC2')
plt.show()
# Reduced Dimensionality eps
dbses_pca_eps = []
for index, val in enumerate(eps_vals_to_try):
    dbs = DBSCAN(eps=val, min_samples=3)
    dbs.fit(pca_glass_reduced)
    dbses_pca_eps += [dbs]
fig = plt.figure("eps reduced", figsize=(15,15))
plt.suptitle("Reduced Dimensionality Eps")
for index, val in enumerate(dbses_pca_eps):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(pca_glass_reduced[:,0], pca_glass_reduced[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('PC1')
    plt.ylabel('PC2')
plt.show()
import clust_funcs
fig = plt.figure(figsize=(15,10))
plt.suptitle(f"Precisions")
fig.add_subplot(2,3,1)
prec_list = [clust_funcs.precision(glass_type_array, kmeans_org[i].labels_)[1] for i in range(len(kmeans_org))]
plt.plot(kmeans_values_to_try, prec_list, c='b',marker='.')
plt.title("KMeans Original")
plt.xlabel("Number of Clusters")
plt.ylabel("Precision")
fig.add_subplot(2,3,4)
prec_list = [clust_funcs.precision(glass_type_array, kmeans_red[i].labels_)[1] for i in range(len(kmeans_red))]
plt.plot(kmeans_values_to_try, prec_list, c='b',marker='.')
plt.title("KMeans PCA Reduced")
plt.xlabel("Number of Clusters")
plt.ylabel("Precision")
fig.add_subplot(2,3,2)
prec_list = [clust_funcs.precision(glass_type_array, dbses_org_min[i].labels_)[1] for i in range(len(dbses_org_min))]
plt.plot(minpts_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Minpts Original")
plt.xlabel("Minpts")
plt.ylabel("Precision")
fig.add_subplot(2,3,5)
prec_list = [clust_funcs.precision(glass_type_array, dbses_pca_min[i].labels_)[1] for i in range(len(dbses_pca_min))]
plt.plot(minpts_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Minpts PCA Reduced")
plt.xlabel("Minpts")
plt.ylabel("Precision")
fig.add_subplot(2,3,3)
prec_list = [clust_funcs.precision(glass_type_array, dbses_org_eps[i].labels_)[1] for i in range(len(dbses_org_eps))]
plt.plot(eps_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Epsilon Original")
plt.xlabel("Epsilon")
plt.ylabel("Precision")
fig.add_subplot(2,3,6)
prec_list = [clust_funcs.precision(glass_type_array, dbses_pca_eps[i].labels_)[1] for i in range(len(dbses_pca_eps))]
plt.plot(eps_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Epsilon PCA Reduced")
plt.xlabel("Epsilon")
plt.ylabel("Precision")
plt.show()