This dataset was used to experiment with combining instance-based and model-based learning methods and appeared in the 1983 American Statistical Association Exposition. It was originally uploaded by R. Quinlan from the University of Massachusetts in 1993.
There are nine attributes and 406 entities in the original dataset. The modified dataset has 398 entities, but I used the original dataset for this project. The first six attributes are numerical and the last three are categorical. Model year should be label encoded, since a larger difference in model year should correspond to a larger difference in the results. There is very little documentation about what origin represents; it is discrete, ranges from 1 to 3, and would normally be one-hot encoded, as would car name. In the end, car name and origin were removed from the analysis: there are 312 distinct names and they don't have much of an impact, and since the meaning of origin could not be determined it should not influence the results. A small sketch of the two encodings is shown below.
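As a rough sketch of the two encodings discussed above (the values here are made up for illustration and are not the real auto-mpg data), label encoding maps the sorted year values to consecutive integers, while one-hot encoding gives origin one indicator column per category via pandas' get_dummies:

import pandas as pd

# Illustrative sketch only - a tiny made-up frame, not the real auto-mpg data
example = pd.DataFrame({"model year": [70, 71, 82, 71], "origin": [1, 3, 2, 1]})

# Label encoding: sorted unique years -> 0, 1, 2, ... (preserves ordering and spacing)
year_map = {year: code for code, year in enumerate(sorted(example["model year"].unique()))}
example["model year"] = example["model year"].map(year_map)

# One-hot encoding: one 0/1 indicator column per origin category (no implied ordering)
example = pd.get_dummies(example, columns=["origin"], prefix="origin")
print(example)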
import pandas as pd
import numpy as np
import data_funcs
mpg_org = pd.read_table("https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original", delim_whitespace=True, names=["mpg","cylinders","displacement","horsepower","weight","acceleration","model year","origin","car name"])
print(mpg_org.isnull().sum())
print(f"\nOverall, 14/{406*9} values are missing")
print(f"8 out of 406 mpg values are missing")
print("6 out of 406 horsepower values are missing")
mpg             8
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

Overall, 14/3654 values are missing
8 out of 406 mpg values are missing
6 out of 406 horsepower values are missing
I've driven a variety of vehicles throughout my life, and I always find it interesting how manufacturers design them with different aspects in mind. The car I currently drive gets pretty good gas mileage on the highway, but not nearly as good in Bozeman. Meanwhile, my partner's car gets almost as good gas mileage in Bozeman as it does on the highway. It will be interesting to see how the different vehicle metrics affect each other.
Miles per gallon is the focus of the dataset, so I think it will be the most interesting attribute. Horsepower, model year, and weight should interact with mpg the most and will likely be the most important.
def attr_cov(vec1, vec2):
    # Sample covariance between two attribute vectors, using an n-1 denominator
    assert len(vec1) == len(vec2)  # Must be same length
    vec1_mean = calc_mean([vec1])
    vec2_mean = calc_mean([vec2])
    return (
        1
        / (len(vec1) - 1)
        * sum([(vec1[i] - vec1_mean) * (vec2[i] - vec2_mean) for i in range(len(vec1))])
    )

def attr_cor(vec1, vec2):
    # Pearson correlation: covariance divided by the product of the standard deviations
    assert len(vec1) == len(vec2)  # Must be same length
    return attr_cov(vec1=vec1, vec2=vec2) / (
        (attr_cov(vec1, vec1) * attr_cov(vec2, vec2)) ** 0.5
    )
def range_normalize(pd_data):
    # Min-max scale every column to the range [0, 1]
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        _min = min(pd_data[:, column])
        _max = max(pd_data[:, column])
        for row in pd_data_copy:
            row[column] = (row[column] - _min) / (_max - _min)
    return pd_data_copy
def standard_normalize(pd_data):
    # Z-score normalize every column: subtract the mean, divide by the standard deviation
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        _mean = calc_mean([pd_data[:, column]])
        _std = np.sqrt(attr_cov(pd_data[:, column], pd_data[:, column]))
        for row in pd_data_copy:
            row[column] = (row[column] - _mean) / _std
    return pd_data_copy
def cov_mat(pd_data):
    # Sample covariance between every pair of columns, collected into a matrix
    new_cov_mat = np.ndarray((len(pd_data[0]), len(pd_data[0])))
    for column in range(len(pd_data[0])):
        for column2 in range(len(pd_data[0])):
            new_cov_mat[column][column2] = attr_cov(
                pd_data[:, column], pd_data[:, column2]
            )
    return new_cov_mat
def lab_encode(pd_data):
    # Label encode every column: map the sorted unique values to 0, 1, 2, ...
    pd_data_copy = np.copy(pd_data)
    for column in range(len(pd_data[0])):
        categories = []
        for row in pd_data_copy:
            if row[column] not in categories:
                categories += [row[column]]
        categories = sorted(categories)
        cat_dict = {}
        count = 0
        for i in categories:
            cat_dict[i] = count
            count += 1
        for row in pd_data_copy:
            row[column] = cat_dict.get(row[column])
    return pd_data_copy
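As a quick sanity check of the helpers above (a sketch, not part of the original assignment; it assumes calc_mean in data_funcs is an ordinary arithmetic mean), they can be compared against NumPy's built-ins on a small toy array:

# Sanity-check sketch: compare the hand-rolled helpers against NumPy on toy data
toy = np.array([[1.0, 2.0, 4.0],
                [2.0, 1.0, 8.0],
                [3.0, 6.0, 2.0],
                [4.0, 4.0, 6.0]])
print(data_funcs.attr_cov(toy[:, 0], toy[:, 1]), np.cov(toy[:, 0], toy[:, 1])[0, 1])
print(data_funcs.attr_cor(toy[:, 0], toy[:, 2]), np.corrcoef(toy[:, 0], toy[:, 2])[0, 1])
print(np.allclose(data_funcs.cov_mat(toy), np.cov(toy, rowvar=False)))
print(np.allclose(data_funcs.range_normalize(toy),
                  (toy - toy.min(axis=0)) / (toy.max(axis=0) - toy.min(axis=0))))
print(np.allclose(data_funcs.standard_normalize(toy),
                  (toy - toy.mean(axis=0)) / toy.std(axis=0, ddof=1)))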
There were two versions of this dataset on the UCI Machine Learning Repository: the original dataset, and a modified one with the instances that have a null mpg removed. I chose to use the original dataset and fill in the null values with the mean.
# pd.get_dummies(mpg_org, prefix='name') # this was to one hot encode the names attribute
mpg_org = mpg_org.fillna(mpg_org.mean(numeric_only=True))  # fill missing values with column means; numeric_only skips the car name column
# Label encode model year
mpg_data = mpg_org.to_numpy()
mpg_data[:,6] = data_funcs.lab_encode(mpg_data)[:,6]
# Remove the names category. There are 312 different car names
# Also remove Origin because there is no explanation for what it means. It seems irrelevant
mpg_data = mpg_data[:,0:7]
print(mpg_data)
[[18.0 8.0 307.0 ... 3504.0 12.0 0]
 [15.0 8.0 350.0 ... 3693.0 11.5 0]
 [18.0 8.0 318.0 ... 3436.0 11.0 0]
 ...
 [32.0 4.0 135.0 ... 2295.0 11.6 12]
 [28.0 4.0 120.0 ... 2625.0 18.6 12]
 [31.0 4.0 119.0 ... 2720.0 19.4 12]]
# Multivariate Mean
data_funcs.calc_mean(mpg_data, axis=0)
[23.51457286432162, 5.475369458128079, 194.7795566502463, 105.08249999999997, 2979.4137931034484, 15.519704433497521, 5.921182266009852]
# Covariate Matrix
data_funcs.cov_mat(mpg_data)
array([[ 5.98829024e+01, -1.01052782e+01, -6.42456099e+02, -2.25733206e+02, -5.39646682e+03,  8.87998784e+00,  1.64104734e+01],
       [-1.01052782e+01,  2.93149060e+00,  1.70982829e+02,  5.53524630e+01,  1.29825466e+03, -2.50766162e+00, -2.31552636e+00],
       [-6.42456099e+02,  1.70982829e+02,  1.10087223e+04,  3.61240300e+03,  8.28688137e+04, -1.64122683e+02, -1.50138405e+02],
       [-2.25733206e+02,  5.53524630e+01,  3.61240300e+03,  1.48075130e+03,  2.81154182e+04, -7.46789154e+01, -6.05515309e+01],
       [-5.39646682e+03,  1.29825466e+03,  8.28688137e+04,  2.81154182e+04,  7.17416332e+05, -1.02122027e+03, -1.00142163e+03],
       [ 8.87998784e+00, -2.50766162e+00, -1.64122683e+02, -7.46789154e+01, -1.02122027e+03,  7.85882065e+00,  3.17365566e+00],
       [ 1.64104734e+01, -2.31552636e+00, -1.50138405e+02, -6.05515309e+01, -1.00142163e+03,  3.17365566e+00,  1.40530317e+01]])
import matplotlib.pyplot as plt
plt.scatter(mpg_data[:,1], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[1])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: cylinders (x) vs. mpg (y)]
The number of cylinders appears to be related to miles per gallon. This makes sense because each cylinder requires some amount of gasoline in order to fire. A car with more cylinders will have to pump more gas in order to saturate each cylinder.
plt.scatter(mpg_data[:,3], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[3])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: horsepower (x) vs. mpg (y)]
Horsepower also appears to be related to mpg. This makes sense because horsepower is a measure of the power that an engine can output. In order to get more power, more fuel needs to be used, which would decrease mpg. The scatterplot shows a fairly strong relationship.
plt.scatter(mpg_data[:,6], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[6])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: model year (x) vs. mpg (y)]
The scatter plot shows that mpg is related to model year. There is a lot of variation within each year, though mpg consistently goes up. This makes sense because car manufacturers will produce better cars over time as new advancements are made.
plt.scatter(mpg_data[:,4], mpg_data[:, 0])
plt.xlabel(mpg_org.columns[4])
plt.ylabel(mpg_org.columns[0])
[Scatter plot: weight (x) vs. mpg (y)]
Mpg appears to be related to weight. It requires more energy to move a heavier object than a lighter one. This data set was also collected from driving in the city where starts and stops are more common.
plt.scatter(mpg_data[:,4], mpg_data[:, 5])
plt.xlabel(mpg_org.columns[4])
plt.ylabel(mpg_org.columns[5])
[Scatter plot: weight (x) vs. acceleration (y)]
These two are much less related than the other attribute pairs above. One would think that heavier cars would be harder to accelerate, but these vehicles are likely paired with more powerful engines.
range_norm_data = data_funcs.range_normalize(mpg_data)
range_norm_covariance = data_funcs.cov_mat(range_norm_data)
# print(range_norm_covariance)
rn_cov = []
for i in range(len(range_norm_covariance)):
    for j in range(len(range_norm_covariance)):
        if j <= i:  # Zero out repeats and each attribute's variance so they sort last
            range_norm_covariance[i][j] = 0
        rn_cov += [(range_norm_covariance[i][j], i, j)]
rn_cov = sorted(rn_cov, key=lambda x: abs(x[0]), reverse=True)  # sort by absolute covariance
# Collect the column indexes of the top three pairs into a small array
top_3_rn = np.asarray([
    [rn_cov[0][1], rn_cov[0][2]],
    [rn_cov[1][1], rn_cov[1][2]],
    [rn_cov[2][1], rn_cov[2][2]],
])
print(top_3_rn)
print(type(top_3_rn))
[[1 2]
 [1 4]
 [2 4]]
<class 'numpy.ndarray'>
# print(covariance)
print("For the range normalized data, the top three covariances are:\n")
for i in range(len(top_3_rn)):
    print(f"Covariance between {mpg_org.columns[top_3_rn[i,0]]} and {mpg_org.columns[top_3_rn[i,1]]} is {range_norm_covariance[top_3_rn[i,0],top_3_rn[i,1]]}")
For the range normalized data, the top three covariances are:

Covariance between cylinders and displacement is 0.08836321889367729
Covariance between cylinders and weight is 0.07361806983601418
Covariance between displacement and weight is 0.06071202196227808
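For comparison (an illustrative sketch reusing the same pair-sorting idea, not part of the original assignment), pulling the top three pairs from the raw, un-normalized covariance matrix shows why normalizing first matters: the raw values are dominated by weight and displacement simply because their units are large.

# Comparison sketch: the same top-3 ranking on the raw (un-normalized) covariances
raw_cov = data_funcs.cov_mat(mpg_data)
raw_pairs = []
for i in range(len(raw_cov)):
    for j in range(i + 1, len(raw_cov)):
        raw_pairs += [(raw_cov[i][j], i, j)]
raw_pairs = sorted(raw_pairs, key=lambda x: abs(x[0]), reverse=True)
for value, i, j in raw_pairs[:3]:
    print(f"Raw covariance between {mpg_org.columns[i]} and {mpg_org.columns[j]} is {value}")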
# Top three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Top 3 Covariance")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(range_norm_data[:,top_3_rn[0,0]], range_norm_data[:,top_3_rn[0,1]])
plt.xlabel(mpg_org.columns[top_3_rn[0,0]])
plt.ylabel(mpg_org.columns[top_3_rn[0,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(range_norm_data[:,top_3_rn[1,0]], range_norm_data[:,top_3_rn[1,1]])
plt.xlabel(mpg_org.columns[top_3_rn[1,0]])
plt.ylabel(mpg_org.columns[top_3_rn[1,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(range_norm_data[:,top_3_rn[2,0]], range_norm_data[:,top_3_rn[2,1]])
plt.xlabel(mpg_org.columns[top_3_rn[2,0]])
plt.ylabel(mpg_org.columns[top_3_rn[2,1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
Z_norm_data = data_funcs.standard_normalize(mpg_data)
corr_dict = {}
for i in range(len(Z_norm_data[0])):
    for j in range(i + 1, len(Z_norm_data[0])):
        # Dictionary keyed by the correlation, storing the column indexes of the pair
        corr_dict[data_funcs.attr_cor(Z_norm_data[:, i], Z_norm_data[:, j])] = [i, j]
sorted_keys = sorted(corr_dict.keys(), key=abs, reverse=True)
print("For the Z-Score-Normalized data, the top three Correlated pairs are:\n")
for i in range(3):
    print(f"Correlation between {mpg_org.columns[corr_dict.get(sorted_keys[i])[0]]} and {mpg_org.columns[corr_dict.get(sorted_keys[i])[1]]} is {sorted_keys[i]}")
For the Z-Score-Normalized data, the top three Correlated pairs are:

Correlation between cylinders and displacement is 0.9517870386276686
Correlation between displacement and weight is 0.9324746504807468
Correlation between cylinders and weight is 0.8952204775128856
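Because correlation is unchanged by shifting and rescaling each attribute, the same top pairs should fall out of the raw data as well; a minimal cross-check with NumPy (a sketch, not part of the original analysis):

# Correlation is invariant to z-score normalization, so the raw cylinders and
# displacement columns should reproduce the ~0.952 value reported above.
print(np.corrcoef(mpg_data[:, 1].astype(float), mpg_data[:, 2].astype(float))[0, 1])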
# Top three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Top 3 Z-Score-Normalized Correlation")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[0])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[0])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[1])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[1])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[2])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[2])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
sorted_keys = sorted_keys[::-1]
print("For the Z-Score-Normalized data, the least three Correlated pairs are:\n")
for i in range(3):
    print(f"Correlation between {mpg_org.columns[corr_dict.get(sorted_keys[i])[0]]} and {mpg_org.columns[corr_dict.get(sorted_keys[i])[1]]} is {sorted_keys[i]}")
For the Z-Score-Normalized data, the least three Correlated pairs are:

Correlation between acceleration and model year is 0.3019924618258969
Correlation between weight and model year is -0.3153888413389182
Correlation between cylinders and model year is -0.36076173267892825
# Least three pairs scatterplots
fig = plt.figure("Full Deal")
plt.suptitle(f"Least 3 Z-Score-Normalized Correlation")
# show first image
ax = fig.add_subplot(1, 3, 1)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[0])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[0])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[0])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[0,0]]} vs. {mpg_org.columns[top_3_rn[0,1]]}")
ax = fig.add_subplot(1, 3, 2)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[1])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[1])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[1])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[1,0]]} vs. {mpg_org.columns[top_3_rn[1,1]]}")
ax = fig.add_subplot(1, 3, 3)
plt.scatter(Z_norm_data[:,corr_dict.get(sorted_keys[2])[0]], Z_norm_data[:,corr_dict.get(sorted_keys[2])[1]])
plt.xlabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[0]])
plt.ylabel(mpg_org.columns[corr_dict.get(sorted_keys[2])[1]])
# plt.title(f"{mpg_org.columns[top_3_rn[2,0]]} vs. {mpg_org.columns[top_3_rn[2,1]]}")
plt.subplots_adjust(wspace=0.9)
plt.show()
count = 0
corr_list = []
for i in range(len(mpg_data[0])):
    for j in range(i + 1, len(mpg_data[0])):
        corr_list += [data_funcs.attr_cor(mpg_data[:, i], mpg_data[:, j])]
        if corr_list[-1] >= 0.5:
            count += 1
print(f"There are {count} pairs of features that have correlation greater than or equal to 0.5")
There are 7 pairs of features that have correlation greater than or equal to 0.5
count = 0
for i in range(len(mpg_data[0])):
    for j in range(i + 1, len(mpg_data[0])):
        if data_funcs.attr_cov(mpg_data[:, i], mpg_data[:, j]) < 0:
            count += 1
print(f"There are {count} pairs of features that have negative sample covariance")
There are 12 pairs of features that have negative sample covariance
total_variance = 0
var_list = []
for i in range(len(mpg_data[0])):
    var_list += [data_funcs.attr_cov(mpg_data[:, i], mpg_data[:, i])]
    total_variance += var_list[-1]
print(f"The total variance of the data is {total_variance}")
The total variance of the data is 729990.5318764142
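Since the total variance is just the sum of the per-attribute variances, it should also equal the trace (the sum of the diagonal) of the covariance matrix; a quick check using the cov_mat helper from above:

# The diagonal of the covariance matrix holds the per-attribute variances,
# so its trace should reproduce the total variance printed above.
print(np.trace(data_funcs.cov_mat(mpg_data)))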
top_5_variance = 0
var_list = sorted(var_list, reverse=True)
for i in range(5):
    top_5_variance += var_list[i]
print(f"The total variance of the data restricted to the five features that have the greatest sample variance is {top_5_variance}")
The total variance of the data restricted to the five features that have the greatest sample variance is 729979.7415651571
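As a small follow-up (not part of the original write-up), the two totals above imply that the five highest-variance features carry essentially all of the variance, with weight alone contributing the bulk of it:

# Fraction of the total variance carried by the five highest-variance features
print(top_5_variance / total_variance)  # roughly 0.99999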