This dataset was used to experiment with combining instance-based and model-based learning methods and appeared in the 1983 American Statistical Association Exposition. It was originally uploaded by R. Quinlan from the University of Massachusetts in 1993.
There are nine attributes and 406 entities in the original dataset. The modified dataset has 398 entities, but I used the original dataset for this project. The first six attributes are numerical and the last three are categorical. Model year should be label encoded, since a larger difference in model year should correspond to a larger difference in the results. There is very little documentation about what origin represents; it is discrete, ranges from 1 to 3, and would normally be one-hot encoded, as would car name. In the end, car name and origin were removed from the analysis: there are 312 distinct names and they don't have much of an impact, and since the meaning of origin could not be determined it should not influence the results. A small sketch of the two encodings is shown below.
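As a rough sketch of the two encodings discussed above (the values here are made up for illustration and are not the real auto-mpg data), label encoding maps the sorted year values to consecutive integers, while one-hot encoding gives origin one indicator column per category via pandas' get_dummies:

import pandas as pd

# Illustrative sketch only - a tiny made-up frame, not the real auto-mpg data
example = pd.DataFrame({"model year": [70, 71, 82, 71], "origin": [1, 3, 2, 1]})

# Label encoding: sorted unique years -> 0, 1, 2, ... (preserves ordering and spacing)
year_map = {year: code for code, year in enumerate(sorted(example["model year"].unique()))}
example["model year"] = example["model year"].map(year_map)

# One-hot encoding: one 0/1 indicator column per origin category (no implied ordering)
example = pd.get_dummies(example, columns=["origin"], prefix="origin")
print(example)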
import pandas as pd
import numpy as np
import data_funcs
mpg_org = pd.read_table("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original", delim_whitespace=True, names=["mpg","cylinders","displacement","horsepower","weight","acceleration","model year","origin","car name"])
print(mpg_org.isnull().sum())
print(f"\nOverall, 14/{406*9} values are missing")
print(f"8 out of 406 mpg values are missing")
print("6 out of 406 horsepower values are missing")
mpg             8
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

Overall, 14/3654 values are missing
8 out of 406 mpg values are missing
6 out of 406 horsepower values are missing
I've driven a variety of vehicles throughout my life, and I always find it interesting how manufacturers design them with different aspects in mind. The car I currently drive gets pretty good gas mileage on the highway, but not nearly as good in Bozeman. Meanwhile, my partner's car gets almost as good gas mileage in Bozeman as it does on the highway. It will be interesting to see how the different vehicle metrics affect each other.
Miles per gallon is the focus of the dataset, so I think it will be the most interesting attribute. Horsepower, model year, and weight should interact with mpg the most and will likely be the most important.
def attr_cov(vec1, vec2):
    # Sample covariance between two attribute vectors, using an n-1 denominator
    assert len(vec1) == len(vec2)  # Must be same length
    vec1_mean = calc_mean([vec1])
    vec2_mean = calc_mean([vec2])
    return (
        1
        / (len(vec1) - 1)
        * sum([(vec1[i] - vec1_mean) * (vec2[i] - vec2_mean) for i in range(len(vec1))])
    )

def attr_cor(vec1, vec2):
    # Pearson correlation: covariance divided by the product of the standard deviations
    assert len(vec1) == len(vec2)  # Must be same length
    return attr_cov(vec1=vec1, vec2=vec2) / (
        (attr_cov(vec1, vec1) * attr_cov(vec2, vec2)) ** 0.5
    )
def range_normalize(pd_data):
    # Min-max scale every column to the range [0, 1]
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        _min = min(pd_data[:, column])
        _max = max(pd_data[:, column])
        for row in pd_data_copy:
            row[column] = (row[column] - _min) / (_max - _min)
    return pd_data_copy
def standard_normalize(pd_data):
    # Z-score normalize every column: subtract the mean, divide by the standard deviation
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        _mean = calc_mean([pd_data[:, column]])
        _std = np.sqrt(attr_cov(pd_data[:, column], pd_data[:, column]))
        for row in pd_data_copy:
            row[column] = (row[column] - _mean) / _std
    return pd_data_copy
def cov_mat(pd_data):
    # Sample covariance between every pair of columns, collected into a matrix
    new_cov_mat = np.ndarray((len(pd_data[0]), len(pd_data[0])))
    for column in range(len(pd_data[0])):
        for column2 in range(len(pd_data[0])):
            new_cov_mat[column][column2] = attr_cov(
                pd_data[:, column], pd_data[:, column2]
            )
    return new_cov_mat
def lab_encode(pd_data):
    # Label encode every column: map the sorted unique values to 0, 1, 2, ...
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        categories = []
        for row in pd_data_copy:
            if row[column] not in categories:
                categories += [row[column]]
        categories = sorted(categories)
        cat_dict = {}
        count = 0
        for i in categories:
            cat_dict[i] = count
            count += 1
        for row in pd_data_copy:
            row[column] = cat_dict.get(row[column])
    return pd_data_copy
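As a quick sanity check of the helpers above (a sketch, not part of the original assignment; it assumes calc_mean in data_funcs is an ordinary arithmetic mean), they can be compared against NumPy's built-ins on a small toy array:

# Sanity-check sketch: compare the hand-rolled helpers against NumPy on toy data
toy = np.array([[1.0, 2.0, 4.0],
                [2.0, 1.0, 8.0],
                [3.0, 6.0, 2.0],
                [4.0, 4.0, 6.0]])
print(data_funcs.attr_cov(toy[:, 0], toy[:, 1]), np.cov(toy[:, 0], toy[:, 1])[0, 1])
print(data_funcs.attr_cor(toy[:, 0], toy[:, 2]), np.corrcoef(toy[:, 0], toy[:, 2])[0, 1])
print(np.allclose(data_funcs.cov_mat(toy), np.cov(toy, rowvar=False)))
print(np.allclose(data_funcs.range_normalize(toy),
                  (toy - toy.min(axis=0)) / (toy.max(axis=0) - toy.min(axis=0))))
print(np.allclose(data_funcs.standard_normalize(toy),
                  (toy - toy.mean(axis=0)) / toy.std(axis=0, ddof=1)))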
There were two versions of this dataset on the UCI Machine Learning Repository: the original dataset, and a modified one with the instances that have a null mpg removed. I chose to use the original dataset and fill in the null values with the mean.
# pd.get_dummies(mpg_org, prefix='name') # this was to one hot encode the names attribute
mpg_org = mpg_org.fillna(mpg_org.mean(numeric_only=True))  # fill missing values with column means; numeric_only skips the car name column
# Label encode model year
mpg_data = mpg_org.to_numpy()
mpg_data[:,6] = data_funcs.lab_encode(mpg_data)[:,6]
# Remove the names category. There are 312 different car names
# Also remove Origin because there is no explanation for what it means. It seems irrelevant
mpg_data = mpg_data[:,0:7]
print(mpg_data)
[[18.0 8.0 307.0 ... 3504.0 12.0 0]
 [15.0 8.0 350.0 ... 3693.0 11.5 0]
 [18.0 8.0 318.0 ... 3436.0 11.0 0]
 ...
 [32.0 4.0 135.0 ... 2295.0 11.6 12]
 [28.0 4.0 120.0 ... 2625.0 18.6 12]
 [31.0 4.0 119.0 ... 2720.0 19.4 12]]
# Multivariate Mean
data_funcs.calc_mean(mpg_data, axis=0)
[23.51457286432162, 5.475369458128079, 194.7795566502463, 105.08249999999997, 2979.4137931034484, 15.519704433497521, 5.921182266009852]
# Covariate Matrix
data_funcs.cov_mat(mpg_data)
array([[ 5.98829024e+01, -1.01052782e+01, -6.42456099e+02, -2.25733206e+02, -5.39646682e+03,  8.87998784e+00,  1.64104734e+01],
       [-1.01052782e+01,  2.93149060e+00,  1.70982829e+02,  5.53524630e+01,  1.29825466e+03, -2.50766162e+00, -2.31552636e+00],
       [-6.42456099e+02,  1.70982829e+02,  1.10087223e+04,  3.61240300e+03,  8.28688137e+04, -1.64122683e+02, -1.50138405e+02],
       [-2.25733206e+02,  5.53524630e+01,  3.61240300e+03,  1.48075130e+03,  2.81154182e+04, -7.46789154e+01, -6.05515309e+01],
       [-5.39646682e+03,  1.29825466e+03,  8.28688137e+04,  2.81154182e+04,  7.17416332e+05, -1.02122027e+03, -1.00142163e+03],
       [ 8.87998784e+00, -2.50766162e+00, -1.64122683e+02, -7.46789154e+01, -1.02122027e+03,  7.85882065e+00,  3.17365566e+00],
       [ 1.64104734e+01, -2.31552636e+00, -1.50138405e+02, -6.05515309e+01, -1.00142163e+03,  3.17365566e+00,  1.40530317e+01]])
import matplotlib.pyplot as plt
plt.scatter(mpg_data[:,1], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[1])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: cylinders (x) vs. mpg (y)]
The number of cylinders appears to be related to miles per gallon. This makes sense because each cylinder requires some amount of gasoline in order to fire. A car with more cylinders will have to pump more gas in order to saturate each cylinder.
plt.scatter(mpg_data[:,3], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[3])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: horsepower (x) vs. mpg (y)]
Horsepower also appears to be related to mpg. This makes sense because horsepower is a measure of the power that an engine can output. In order to get more power, more fuel needs to be used, which would decrease mpg. The scatterplot shows a fairly strong relationship.
plt.scatter(mpg_data[:,6], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[6])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: model year (x) vs. mpg (y)]
The scatter plot shows that mpg is related to model year. There is a lot of variation within each year, though mpg consistently goes up. This makes sense because car manufacturers will produce better cars over time as new advancements are made.
plt.scatter(mpg_data[:,4], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[4])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: weight (x) vs. mpg (y)]
Mpg appears to be related to weight. It requires more energy to move a heavier object than a lighter one. This data set was also collected from driving in the city where starts and stops are more common.
plt.scatter(mpg_data[:,4], mpg_data[:, 5])
plt.xlabel(mpg_org.columns[4])
plt.ylabel(mpg_org.columns[5])
[Scatter plot: weight (x) vs. acceleration (y)]
These two are much less related than the other attribute pairs above. One would think that heavier cars would be harder to accelerate, but these vehicles are likely paired with more powerful engines.
range_norm_data = data_funcs.range_normalize(mpg_data)
range_norm_covariance = data_funcs.cov_mat(range_norm_data)
# print(range_norm_covariance)
rn_cov = []
for i in range(len(range_norm_covariance)):
    for j in range(len(range_norm_covariance)):
        if j <= i:  # Zero out repeats and each attribute's variance so they sort last
            range_norm_covariance[i][j] = 0
        rn_cov += [(range_norm_covariance[i][j], i, j)]
rn_cov = sorted(rn_cov, key=lambda x: abs(x[0]), reverse=True)  # sort by absolute covariance
# Collect the column indexes of the top three pairs into a small array
top_3_rn = np.asarray([
    [rn_cov[0][1], rn_cov[0][2]],
    [rn_cov[1][1], rn_cov[1][2]],
    [rn_cov[2][1], rn_cov[2][2]],
])
print(top_3_rn)
print(type(top_3_rn))
[[1 2]
 [1 4]
 [2 4]]
<class 'numpy.ndarray'>
# print(covariance)
print("For the range normalized data, the top three covariances are:\n")
for i in range(len(top_3_rn)):
    print(f"Covariance between {mpg_org.columns[top_3_rn[i,0]]} and {mpg_org.columns[top_3_rn[i,1]]} is {range_norm_covariance[top_3_rn[i,0],top_3_rn[i,1]]}")
For the range normalized data, the top three covariances are:

Covariance between cylinders and displacement is 0.08836321889367729
Covariance between cylinders and weight is 0.07361806983601418
Covariance between displacement and weight is 0.06071202196227808
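For comparison (an illustrative sketch reusing the same pair-sorting idea, not part of the original assignment), pulling the top three pairs from the raw, un-normalized covariance matrix shows why normalizing first matters: the raw values are dominated by weight and displacement simply because their units are large.

# Comparison sketch: the same top-3 ranking on the raw (un-normalized) covariances
raw_cov = data_funcs.cov_mat(mpg_data)
raw_pairs = []
for i in range(len(raw_cov)):
    for j in range(i + 1, len(raw_cov)):
        raw_pairs += [(raw_cov[i][j], i, j)]
raw_pairs = sorted(raw_pairs, key=lambda x: abs(x[0]), reverse=True)
for value, i, j in raw_pairs[:3]:
    print(f"Raw covariance between {mpg_org.columns[i]} and {mpg_org.columns[j]} is {value}")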
# Top three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Top 3 Covariance")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(range_norm_data[:,top_3_rn[0,0]], range_norm_data[:,top_3_rn[0,1]])
plt.xlabel(mpg_org.columns[top_3_rn[0,0]])
plt.ylabel(mpg_org.columns[top_3_rn[0,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(range_norm_data[:,top_3_rn[1,0]], range_norm_data[:,top_3_rn[1,1]])
plt.xlabel(mpg_org.columns[top_3_rn[1,0]])
plt.ylabel(mpg_org.columns[top_3_rn[1,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(range_norm_data[:,top_3_rn[2,0]], range_norm_data[:,top_3_rn[2,1]])
plt.xlabel(mpg_org.columns[top_3_rn[2,0]])
plt.ylabel(mpg_org.columns[top_3_rn[2,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
Z_norm_data = data_funcs.standard_normalize(mpg_data)
corr_dict = {}
for i in range(len(Z_norm_data[0])):
    for j in range(i + 1, len(Z_norm_data[0])):
        # Dictionary keyed by the correlation, storing the column indexes of the pair
        corr_dict[data_funcs.attr_cor(Z_norm_data[:, i], Z_norm_data[:, j])] = [i, j]
sorted_keys = sorted(corr_dict.keys(), key=abs, reverse=True)
print("For the Z-Score-Normalized data, the top three Correlated pairs are:\n")
for i in range(3):
    print(f"Correlation between {mpg_org.columns[corr_dict.get(sorted_keys[i])[0]]} and {mpg_org.columns[corr_dict.get(sorted_keys[i])[1]]} is {sorted_keys[i]}")
For the Z-Score-Normalized data, the top three Correlated pairs are:

Correlation between cylinders and displacement is 0.9517870386276686
Correlation between displacement and weight is 0.9324746504807468
Correlation between cylinders and weight is 0.8952204775128856
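Because correlation is unchanged by shifting and rescaling each attribute, the same top pairs should fall out of the raw data as well; a minimal cross-check with NumPy (a sketch, not part of the original analysis):

# Correlation is invariant to z-score normalization, so the raw cylinders and
# displacement columns should reproduce the ~0.952 value reported above.
print(np.corrcoef(mpg_data[:, 1].astype(float), mpg_data[:, 2].astype(float))[0, 1])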
# Top three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Top 3 Z-Score-Normalized Correlation")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[0])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[0])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[1])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[1])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[2])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[2])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
sorted_keys = sorted_keys[::-1]
print("For the Z-Score-Normalized data, the least three Correlated pairs are:\n")
for i in range(3):
    print(f"Correlation between {mpg_org.columns[corr_dict.get(sorted_keys[i])[0]]} and {mpg_org.columns[corr_dict.get(sorted_keys[i])[1]]} is {sorted_keys[i]}")
For the Z-Score-Normalized data, the least three Correlated pairs are:

Correlation between acceleration and model year is 0.3019924618258969
Correlation between weight and model year is -0.3153888413389182
Correlation between cylinders and model year is -0.36076173267892825
# Least three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Least 3 Z-Score-Normalized Correlation")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[0])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[0])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[1])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[1])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[2])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[2])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
count = 0
corr_list = []
for i in range(len(mpg_data[0])):
    for j in range(i + 1, len(mpg_data[0])):
        corr_list += [data_funcs.attr_cor(mpg_data[:, i], mpg_data[:, j])]
        if corr_list[-1] >= 0.5:
            count += 1
print(f"There are {count} pairs of features that have correlation greater than or equal to 0.5")
There are 7 pairs of features that have correlation greater than or equal to 0.5
count = 0
for i in range(len(mpg_data[0])):
    for j in range(i + 1, len(mpg_data[0])):
        if data_funcs.attr_cov(mpg_data[:, i], mpg_data[:, j]) < 0:
            count += 1
print(f"There are {count} pairs of features that have negative sample covariance")
There are 12 pairs of features that have negative sample covariance
total_variance = 0
var_list = []
for i in range(len(mpg_data[0])):
    var_list += [data_funcs.attr_cov(mpg_data[:, i], mpg_data[:, i])]
    total_variance += var_list[-1]
print(f"The total variance of the data is {total_variance}")
The total variance of the data is 729990.5318764142
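Since the total variance is just the sum of the per-attribute variances, it should also equal the trace (the sum of the diagonal) of the covariance matrix; a quick check using the cov_mat helper from above:

# The diagonal of the covariance matrix holds the per-attribute variances,
# so its trace should reproduce the total variance printed above.
print(np.trace(data_funcs.cov_mat(mpg_data)))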
top_5_variance = 0
var_list = sorted(var_list, reverse=True)
for i in range(5):
    top_5_variance += var_list[i]
print(f"The total variance of the data restricted to the five features that have the greatest sample variance is {top_5_variance}")
The total variance of the data restricted to the five features that have the greatest sample variance is 729979.7415651571
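As a small follow-up (not part of the original write-up), the two totals above imply that the five highest-variance features carry essentially all of the variance, with weight alone contributing the bulk of it:

# Fraction of the total variance carried by the five highest-variance features
print(top_5_variance / total_variance)  # roughly 0.99999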