https://archive.ics.uci.edu/ml/datasets/Glass+Identification
Glass is a fun substance and it is everywhere around us. It is interesting to learn a little more about the composition of such a common material and how different physical properties lend themselves to different use cases. The dataset has 9 numerical attributes and a 10th categorical attribute that represents the type of glass. There are no missing values in this data set.
There are six different types of glass in this dataset, so I would expect there to be six clusters. If the clusters are distinct, it would mean the types of glass have distinct characteristics. If the clusters are not distinct, the types of glass are quite similar, so the type of glass may matter less than its application or use. I am not sure whether the glass characteristics are distinct; there may be a lot of variance within each type. I expect between three and six clusters. The clusters should not be the same size: glass types five and six have fewer instances than types one, two, or seven.
import random
import numpy as np
from collections import deque
def kmeans(data: np.ndarray, n_clusters: int, epsilon: float):
    """
    5. A function that implements the k-means clustering algorithm.
    The function should take a data matrix, a number of clusters k, and
    a convergence parameter epsilon, as input, and return the representatives
    (means) as well as the clusters found using k-means.
    If the distance is the same between a point and more than one representative
    (mean), then assign the point to the mean corresponding to the cluster with
    the lowest index.
    """
    assert n_clusters > 0
    num_attr = len(data[0])
    centroids = np.zeros([n_clusters, num_attr])
    change = epsilon + 1  # force at least one iteration
    labels = np.zeros([len(data)])
    sums_list = np.zeros([n_clusters])
    # Initialize each centroid coordinate uniformly at random within the
    # observed range of the corresponding attribute.
    for i in range(num_attr):
        for j in range(n_clusters):
            centroids[j][i] = random.random() * (
                np.max(data[:, i]) - np.min(data[:, i])
            ) + np.min(data[:, i])
    while change > epsilon:
        last_centroids = np.ndarray.copy(centroids)
        # Assignment step: attach each point to its nearest centroid.
        # Strict "<" breaks ties in favor of the lowest cluster index.
        for index, instance in enumerate(data):
            closest_index = 0
            distance = np.inf
            for label_index, centroid in enumerate(centroids):
                if (new_dist := np.linalg.norm(instance - centroid)) < distance:
                    closest_index = label_index
                    distance = new_dist
            labels[index] = closest_index
        # Update step: recompute each centroid as the mean of its points.
        centroids.fill(0)
        sums_list.fill(0)
        for index, label in enumerate(labels):
            for centroid_index, value in enumerate(data[index]):
                centroids[int(label), centroid_index] += value
            sums_list[int(label)] += 1
        for index, sum_value in enumerate(sums_list):
            for index2, value in enumerate(centroids[index]):
                if value != 0:  # guards 0/0 when a cluster ends up empty
                    centroids[index][index2] = value / sum_value
        # Converged once the total centroid movement drops below epsilon.
        change = sum(
            np.linalg.norm(centroids[x] - last_centroids[x])
            for x in range(n_clusters)
        )
    return centroids, labels
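As a quick sanity check, the function can be run on a small synthetic dataset. This is a minimal sketch with illustrative names (toy_data, toy_centroids), not the glass data; because the centroids start at random positions, an unlucky run can leave a cluster empty.
rng = np.random.default_rng(0)
toy_data = np.vstack([
    rng.normal(loc=0.0, scale=0.5, size=(20, 2)),  # blob near (0, 0)
    rng.normal(loc=5.0, scale=0.5, size=(20, 2)),  # blob near (5, 5)
])
toy_centroids, toy_labels = kmeans(toy_data, n_clusters=2, epsilon=1e-4)
print(toy_centroids)  # on a good run, one row near (0, 0) and one near (5, 5)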
def dbscan(data: np.ndarray, minpts: int, epsilon: float):
    """
    6. A function that implements the DBSCAN clustering algorithm.
    The function should take a data matrix and the parameters minpts and
    epsilon as input, and return the clusters found using DBSCAN,
    where each data point is labeled as either a noise point, a border point,
    or a core point.
    """
    core_points = set()
    border_points = {}
    noise_points = set()
    cluster_assignments = np.zeros([len(data)])
    i_deque = deque()
    group_index = 1
    close_enough_dict = {}
    # Start with every point presumed noise and an empty neighbor list.
    for i in range(len(data)):
        noise_points.add(i)
        close_enough_dict[i] = []
    # Precompute each point's epsilon-neighborhood (the point itself is
    # not included in its own neighbor list).
    for i in range(len(data) - 1):
        for j in range(i + 1, len(data)):
            if np.linalg.norm(data[i] - data[j]) <= epsilon:
                close_enough_dict[i] += [j]
                close_enough_dict[j] += [i]
    for index in range(len(data)):
        # Only an unvisited core point can seed a new cluster.
        if index in core_points or len(close_enough_dict[index]) < minpts:
            continue
        if index in noise_points:
            noise_points.remove(index)
        i_deque.extend(close_enough_dict[index])
        core_points.add(index)
        cluster_assignments[index] = group_index
        # Breadth-first expansion of the cluster from the seed core point.
        while len(i_deque) > 0:
            new_index = i_deque.popleft()
            if new_index in noise_points:
                noise_points.remove(new_index)
            close_list = close_enough_dict[new_index]
            if len(close_list) >= minpts:
                # Core point: absorb it and enqueue its neighbors.
                for i in close_list:
                    if i not in i_deque and i not in core_points:
                        i_deque.append(i)
                core_points.add(new_index)
                cluster_assignments[new_index] = group_index
            else:
                # Border point: keep the assignment from the densest
                # neighborhood seen so far.
                if new_index in border_points.keys():
                    if len(close_list) > border_points[new_index]:
                        border_points[new_index] = len(close_list)
                        cluster_assignments[new_index] = group_index
                else:
                    border_points[new_index] = len(close_list)
                    cluster_assignments[new_index] = group_index
        group_index += 1
    # Anything never reached keeps cluster id 0, i.e. noise.
    for i in noise_points:
        cluster_assignments[i] = 0
    border_points_list = border_points.keys()
    return (
        sorted(list(core_points)),
        sorted(border_points_list),
        sorted(list(noise_points)),
        cluster_assignments,
    )
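A similarly hedged sketch of calling dbscan on the same kind of two-blob toy data (names are illustrative and the parameter values were not tuned):
rng = np.random.default_rng(1)
toy_data = np.vstack([
    rng.normal(loc=0.0, scale=0.3, size=(15, 2)),
    rng.normal(loc=4.0, scale=0.3, size=(15, 2)),
])
cores, borders, noise, assignments = dbscan(toy_data, minpts=3, epsilon=0.8)
print(len(cores), len(borders), len(noise))  # most points should come back core
print(set(assignments))                      # cluster ids, with 0 marking noise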
def precision(true_lab: list, clust_lab: list):
    """
    7. (EC) A function that computes the precision of a clustering.
    The function should take a list of true cluster labels and a list of the
    cluster labels returned by some clustering algorithm, and return the
    precision of the clustering.
    """
    assert len(true_lab) == len(clust_lab)
    # Convert to arrays so plain Python lists are accepted, then shift both
    # label sets to start at zero (DBSCAN's -1 noise label becomes 0).
    true_lab = np.asarray(true_lab, dtype=int)
    clust_lab = np.asarray(clust_lab, dtype=int)
    true_lab = true_lab - np.min(true_lab)
    clust_lab = clust_lab - np.min(clust_lab)
    # Contingency table: rows are found clusters, columns are true labels.
    cont_table = np.zeros([np.max(clust_lab) + 1, np.max(true_lab) + 1])
    for index in range(len(true_lab)):
        cont_table[clust_lab[index]][true_lab[index]] += 1
    # Precision of a cluster: fraction of its points carrying the majority label.
    prec_array = []
    for index in range(len(cont_table)):
        prec_array += [np.max(cont_table[index]) / np.sum(cont_table[index])]
    return prec_array, sum(prec_array) / len(prec_array)
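As a tiny worked example of the definition (hypothetical labels, not glass data): a cluster holding two points of true class 0 and one point of true class 1 is 2/3 pure, so:
per_cluster, avg = precision([0, 0, 1, 1], [1, 1, 1, 2])
print(per_cluster)  # [0.666..., 1.0]: the first cluster is 2/3 pure
print(avg)          # 0.833..., the mean of the per-cluster precisions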
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
full_column_names = [
    "id", "refractive index", "sodium", "magnesium", "aluminum",
    "silicon", "potassium", "calcium", "barium", "iron", "type",
]
# 1. Id number: 1 to 214
# 2. RI: refractive index
# 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
# 4. Mg: Magnesium
# 5. Al: Aluminum
# 6. Si: Silicon
# 7. K: Potassium
# 8. Ca: Calcium
# 9. Ba: Barium
# 10. Fe: Iron
# 11. Type of glass
glass_df = pd.read_csv("glass.data", names=full_column_names)
glass_df.drop(columns=["id"], inplace=True)
glass_type_array = glass_df["type"]
# Type 4 has no instances in this dataset, so shift types 5-7 down to 4-6
# to make the labels contiguous.
glass_type_array = [x - 1 if x > 4 else x for x in glass_type_array.to_numpy()]
glass_df.drop(columns=["type"], inplace=True)
column_names = full_column_names[1:-1]
# glass_df.head
# glass_type_array
# project the glass data onto its first two principal components and plot them
pca = PCA(n_components=2)
pca_2_glass = pca.fit_transform(glass_df.to_numpy())
plt.scatter(pca_2_glass[:,0], pca_2_glass[:,1])
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.show()
It looks like there are one or two definite clusters, though there could be a couple of smaller ones along the line.
# transform glass data using PCA and plot fraction of total variance preserved
pca = PCA()
pca_full_glass = pca.fit_transform(glass_df.to_numpy())
plt.plot(range(1,pca_full_glass.shape[1] + 1), np.cumsum(pca.explained_variance_ratio_), marker='x')
plt.title('Glass data: fraction of total variance preserved by principal components')
plt.xlabel('r : the number of principal components')
plt.ylabel('f(r) : fraction of total variance preserved')
plt.show()
print(f"4 components seems like a good number to use")
print(f"The ratio of preserved variance is {np.cumsum(pca.explained_variance_ratio_)[3]}")
pca_glass_reduced = pca_full_glass[:,0:4]
4 components seems like a good number to use
The ratio of preserved variance is 0.9492230765479248
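Choosing r by eye works here, but it could also be automated; the sketch below assumes the fitted pca object from the cell above and an arbitrary 95% variance target:
cum_var = np.cumsum(pca.explained_variance_ratio_)
r = int(np.argmax(cum_var >= 0.95)) + 1  # first r whose cumulative ratio reaches 0.95
print(r, cum_var[r - 1])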
# original
kmeans_values_to_try = [2,3,4,5,6,7,8,9,10]
inertias_org = np.zeros(len(kmeans_values_to_try))
kmeans_org = []
for i, k in enumerate(kmeans_values_to_try):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit_predict(glass_df.to_numpy())
    inertias_org[i] = kmeans.inertia_
    kmeans_org += [kmeans]
plt.plot(kmeans_values_to_try, inertias_org, c='b', marker='.')
plt.show()
print(" For the original data:")
for index, value in enumerate(kmeans_values_to_try):
    print(f"With a k of {value}, the value of the objective function is {inertias_org[index]}")
For the original data:
With a k of 2, the value of the objective function is 819.6292544515811
With a k of 3, the value of the objective function is 589.0314496288756
With a k of 4, the value of the objective function is 494.68709047103204
With a k of 5, the value of the objective function is 402.97441326805455
With a k of 6, the value of the objective function is 336.26864985796294
With a k of 7, the value of the objective function is 292.3536803291611
With a k of 8, the value of the objective function is 268.13355983725194
With a k of 9, the value of the objective function is 245.3509223188667
With a k of 10, the value of the objective function is 230.54825087389395
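Reading the elbow off the plot is subjective; one rough heuristic, sketched below on the inertias just computed, is to take the k where the curve bends hardest, i.e. the largest second difference:
second_diff = np.diff(inertias_org, n=2)  # discrete curvature of the inertia curve
elbow_k = kmeans_values_to_try[int(np.argmax(second_diff)) + 1]  # +1 recenters the stencil
print(elbow_k)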
inertias_reduced = np.zeros(len(kmeans_values_to_try))
kmeans_red = []
for i, k in enumerate(kmeans_values_to_try):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, random_state=0)
    kmeans.fit_predict(pca_glass_reduced)
    inertias_reduced[i] = kmeans.inertia_
    kmeans_red += [kmeans]
plt.plot(kmeans_values_to_try, inertias_reduced, c='b', marker='.')
plt.show()
print(" For the reduced dimensionality data:")
for index, value in enumerate(kmeans_values_to_try):
    print(f"With a k of {value}, the value of the objective function is {inertias_reduced[index]}")
For the reduced dimensionality data:
With a k of 2, the value of the objective function is 751.4793704706781
With a k of 3, the value of the objective function is 521.1086597635691
With a k of 4, the value of the objective function is 425.0868049044277
With a k of 5, the value of the objective function is 334.8954027742835
With a k of 6, the value of the objective function is 276.29927714575257
With a k of 7, the value of the objective function is 242.09180768950813
With a k of 8, the value of the objective function is 218.2884380170536
With a k of 9, the value of the objective function is 198.4323988349524
With a k of 10, the value of the objective function is 183.81037363742666
# original minpts
minpts_vals_to_try = [1,2,3,4,5,6,7,8,9]
dbses_org_min = []
for index, val in enumerate(minpts_vals_to_try):
    dbs = DBSCAN(eps=0.6, min_samples=val)
    dbs.fit(glass_df.to_numpy())
    dbses_org_min += [dbs]
fig = plt.figure("minpts original", figsize=(15,15))
plt.suptitle("Original Minpts")
for index, val in enumerate(dbses_org_min):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(glass_df.to_numpy()[:,0], glass_df.to_numpy()[:,1], c=val.labels_)
    # Subtract the -1 noise label (if present) so it is not counted as a cluster.
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('Attr1')
    plt.ylabel('Attr2')
plt.show()
# original eps
eps_vals_to_try = [0.4,0.5,0.6,0.7,0.8,0.9,1,1.2,1.4]
dbses_org_eps = []
for index, val in enumerate(eps_vals_to_try):
    dbs = DBSCAN(eps=val, min_samples=4)
    dbs.fit(glass_df.to_numpy())
    dbses_org_eps += [dbs]
fig = plt.figure("eps original", figsize=(15,15))
plt.suptitle("Original Eps")
for index, val in enumerate(dbses_org_eps):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(glass_df.to_numpy()[:,0], glass_df.to_numpy()[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('Attr1')
    plt.ylabel('Attr2')
plt.show()
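The eps values above were picked by trial and error; a common alternative, sketched here with sklearn's NearestNeighbors for minpts = 4, is the k-distance plot: sort every point's distance to its 4th-nearest neighbor and look for the knee.
from sklearn.neighbors import NearestNeighbors
# n_neighbors is minpts + 1 because each point comes back as its own
# nearest neighbor at distance zero.
nn = NearestNeighbors(n_neighbors=5).fit(glass_df.to_numpy())
dists, _ = nn.kneighbors(glass_df.to_numpy())
plt.plot(np.sort(dists[:, -1]))
plt.xlabel("points sorted by distance to 4th-nearest neighbor")
plt.ylabel("4-NN distance")
plt.show()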
# Reduced Dimensionality minpts
dbses_pca_min = []
for index, val in enumerate(minpts_vals_to_try):
    dbs = DBSCAN(eps=0.8, min_samples=val)
    dbs.fit(pca_glass_reduced)
    dbses_pca_min += [dbs]
fig = plt.figure("minpts reduced", figsize=(15,15))
plt.suptitle("Reduced Dimensionality Minpts")
for index, val in enumerate(dbses_pca_min):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(pca_glass_reduced[:,0], pca_glass_reduced[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('PC1')
    plt.ylabel('PC2')
plt.show()
# Reduced Dimensionality eps
dbses_pca_eps = []
for index, val in enumerate(eps_vals_to_try):
    dbs = DBSCAN(eps=val, min_samples=3)
    dbs.fit(pca_glass_reduced)
    dbses_pca_eps += [dbs]
fig = plt.figure("eps reduced", figsize=(15,15))
plt.suptitle("Reduced Dimensionality Eps")
for index, val in enumerate(dbses_pca_eps):
    fig.add_subplot(3, 3, index + 1)
    plt.scatter(pca_glass_reduced[:,0], pca_glass_reduced[:,1], c=val.labels_)
    n_clusters_found = len(set(val.labels_)) - (1 if -1 in val.labels_ else 0)
    plt.title(f"Eps: {val.eps} | minpts: {val.min_samples} | clusters found: {n_clusters_found}")
    plt.xlabel('PC1')
    plt.ylabel('PC2')
plt.show()
import clust_funcs
fig = plt.figure(figsize=(15,10))
plt.suptitle(f"Precisions")
fig.add_subplot(2,3,1)
prec_list = [clust_funcs.precision(glass_type_array, kmeans_org[i].labels_)[1] for i in range(len(kmeans_org))]
plt.plot(kmeans_values_to_try, prec_list, c='b',marker='.')
plt.title("KMeans Original")
plt.xlabel("Number of Clusters")
plt.ylabel("Precision")
fig.add_subplot(2,3,4)
prec_list = [clust_funcs.precision(glass_type_array, kmeans_red[i].labels_)[1] for i in range(len(kmeans_red))]
plt.plot(kmeans_values_to_try, prec_list, c='b',marker='.')
plt.title("KMeans PCA Reduced")
plt.xlabel("Number of Clusters")
plt.ylabel("Precision")
fig.add_subplot(2,3,2)
prec_list = [clust_funcs.precision(glass_type_array, dbses_org_min[i].labels_)[1] for i in range(len(dbses_org_min))]
plt.plot(minpts_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Minpts Original")
plt.xlabel("Minpts")
plt.ylabel("Precision")
fig.add_subplot(2,3,5)
prec_list = [clust_funcs.precision(glass_type_array, dbses_pca_min[i].labels_)[1] for i in range(len(dbses_pca_min))]
plt.plot(minpts_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Minpts PCA Reduced")
plt.xlabel("Minpts")
plt.ylabel("Precision")
fig.add_subplot(2,3,3)
prec_list = [clust_funcs.precision(glass_type_array, dbses_org_eps[i].labels_)[1] for i in range(len(dbses_org_eps))]
plt.plot(eps_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Epsilon Original")
plt.xlabel("Epsilon")
plt.ylabel("Precision")
fig.add_subplot(2,3,6)
prec_list = [clust_funcs.precision(glass_type_array, dbses_pca_eps[i].labels_)[1] for i in range(len(dbses_pca_eps))]
plt.plot(eps_vals_to_try, prec_list, c='b',marker='.')
plt.title("DBSCAN Epsilon PCA Reduced")
plt.xlabel("Epsilon")
plt.ylabel("Precision")
plt.show()