Coupon Acceptance Classification and Anomaly Detection

Introduction

https://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation#

This data set was taken from the UCI Machine Learning Repository at the above link. It was gathered from a survey on Amazon Mechanical Turk and donated on 9/15/2020. The researchers who uploaded it wanted a real-world data set for predicting whether or not a driver would accept a coupon in different situations. Some of the attributes describe the respondent's life circumstances and preferences, and some describe the hypothetical coupons or driving situations. There are 25 attributes plus one yes-or-no label attribute, and they are all categorical. There are 12684 instances, and there are missing data points. Each instance records whether or not the respondent would accept the coupon.
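As a minimal sketch of how the data can be loaded and inspected with pandas (the filename is an assumption based on the UCI download; adjust the path as needed):

    import pandas as pd

    # Load the UCI in-vehicle coupon survey data (filename assumed from the UCI download).
    df = pd.read_csv("in-vehicle-coupon-recommendation.csv")

    print(df.shape)   # expect (12684, 26): 25 attributes plus the yes/no label column Y
    print(df.isnull().sum().sort_values(ascending=False).head())   # missing values per column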

The data was originally used to develop a "Bayesian framework for learning rule sets for interpretable classification" that predicts whether or not the respondents would accept the coupon. The work was published in the Journal of Machine Learning Research, covered many other data sets and techniques, and focused primarily on the mathematics.

For this project, I primarily wanted to compare different classification techniques and metrics, but also to see how removing anomalies affects classification. I also wanted to see whether different classifiers were better under different accuracy measures.

Preprocessing

Exploration

Removing Nulls

Classifications Pre-Anomaly Removal

In researching classification techniques, I found this example code: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

I added these classifiers to the ones I had already been using and changed the classification flow a little.
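The flow follows the pattern from that example: build a set of named classifiers, then fit and score each on the same train/test split. A minimal sketch, with an abbreviated classifier list and default parameters rather than the exact settings used in this project:

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import CategoricalNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # X, y are assumed to be the label-encoded attributes and the yes/no labels
    # produced during preprocessing (see the encoding sketch later in this report).
    classifiers = {
        "CategoricalNB": CategoricalNB(),
        "Nearest Neighbors": KNeighborsClassifier(),
        "RBF SVM": SVC(kernel="rbf", probability=True),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        print(name, clf.score(X_test, y_test))   # mean accuracy on the held-out split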

Sklearn has a ton of information, and I felt like years could be spent fully learning even a few modules. This would definitely be an area of further research, especially to understand the classifiers described below in more depth.

CategoricalNB: naive Bayes for categorical features, like we learned in class.

BernoulliNB: designed for binary/boolean features.

MultinomialNB: for classification with discrete features. Can work with tf-idf.

ComplementNB: corrects the "severe assumptions" made by MultinomialNB. Suited for imbalanced data.

GaussianNB: like we learned in class. Uses the mean and standard deviation to calculate probabilities.

Nearest Neighbors: k-nearest neighbors, like we learned in class.

Linear SVM: Support Vector Classification with a linear kernel. (Best with low-dimensional data.)

RBF SVM: Support Vector Classification with a Radial Basis Function kernel. (Best with low-dimensional data.)

Gaussian Process: probabilistic classification based on the Laplace approximation.

Decision Tree: Can be finicky and requires preprocessing, but it is statistically verifiable and lets you see the rules.

Random Forest: Combines many Decision Trees.

Neural Net: multi-layer perceptron; the default hidden_layer_sizes is (100,), a single hidden layer of 100 neurons.

AdaBoost: Adaptive Boosting. Can be used in conjunction with other methods; many weak learners are combined into a stronger one. Uses a Decision Tree as its base classifier.

QDA: Quadratic Discriminant Analysis; assumes each class is normally distributed.

Accuracy: Proportion of Correct Predictions.

Balanced_accuracy: Average of recall for each class.

Average_precision: mean of precisions at each threshold, weighted by the increase in recall. Uses probability estimates.

Neg_brier: negated Brier score, the MSE of the predicted probabilities. A smaller Brier score is better, so it is negated to make larger better. Uses probability estimates.

Jaccard: Size of intersection divided by the size of the union.

F1_score: harmonic mean of precision and recall.

Classification_Report: gives a text table of precision, recall, f1-score, and support for each class.

ROC_AUC: Area under the Receiver Operating Characteristic curve, which plots the true positive rate against the false positive rate. (Better with probability estimates.)
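A minimal sketch of how the metrics above can be computed with sklearn.metrics for one fitted classifier (clf, X_test, and y_test are assumed from a loop like the one above; the probability-based metrics use predict_proba, which not every classifier provides):

    from sklearn import metrics

    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1]   # estimated probability of the positive class

    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.balanced_accuracy_score(y_test, y_pred))
    print(metrics.average_precision_score(y_test, y_prob))
    print(metrics.brier_score_loss(y_test, y_prob))   # negate for neg_brier
    print(metrics.jaccard_score(y_test, y_pred))
    print(metrics.f1_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.roc_auc_score(y_test, y_prob))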

Plots Pre-Anomaly Removal

Anomaly Detection and Removal

Removing Anomalies

Final Splitting

Reclassification Post-Anomaly Removal

Final Plots Post-Anomaly Removal

Graphs Against Pre-Anomaly Removal

Summary

The main objective of this project was to compare classification techniques, classification metrics, and the impact of anomaly detection on classification.

Data

https://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation# This data set was taken from the UCI Machine Learning Repository at the above link. It was gathered from a survey on Amazon Mechanical Turk and donated on 9/15/2020. The researchers who uploaded it wanted a real-world data set for predicting whether or not a driver would accept a coupon in different situations. Some of the attributes describe the respondent's life circumstances and preferences, and some describe the hypothetical coupons or driving situations. There are 25 attributes plus one yes-or-no label attribute, and they are all categorical. There are 12684 instances, and there are missing data points. After removing instances with null data, 12079 instances remain.

The car attribute was removed because it was almost entirely empty. The opposite-direction attribute was the exact complement of the same-direction attribute, so it was removed. The toCoupon_GEQ5min attribute was also removed because it contained only 1's, leaving 22 attributes plus the yes-or-no label attribute.
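A minimal sketch of these cleaning steps (df is the loaded DataFrame; column names follow the UCI data dictionary):

    # Drop the empty, redundant, and constant columns described above.
    df = df.drop(columns=["car", "direction_opp", "toCoupon_GEQ5min"])

    # Drop any instance with a null value.
    df = df.dropna()
    print(len(df))   # 12079 per the counts above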

Encoding

Most of the attributes lent themselves especially well to label encoding, so I label encoded them all. The naive Bayes classifiers probably would not have worked as well with one-hot encoded data, though some of the other classifiers would have benefited from one-hot encoding. An a priori argument could be made that, due to the naivety of the naive Bayes classifiers, the finicky nature of the Decision Tree and Random Forest classifiers, and the SVMs' and QDA's disdain for high dimensionality, one-hot encoding could have negatively impacted classification. One-hot encoding would probably have helped the Nearest Neighbors, Gaussian Process, Neural Net, and AdaBoost classifiers. Care could also have been taken to make sure each label-encoded attribute was encoded in the order that made the most sense; that was not done here and might prove useful for improving future analyses.
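A minimal sketch of the label-encoding step, using sklearn's OrdinalEncoder to map each category to an integer. Note that this assigns codes in lexicographic order rather than a hand-chosen order, which is exactly the ordering concern raised above:

    from sklearn.preprocessing import OrdinalEncoder

    # Label encode every attribute; Y is the yes/no label column.
    encoder = OrdinalEncoder()
    X = encoder.fit_transform(df.drop(columns=["Y"]))
    y = df["Y"].to_numpy()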

Classifiers and Metrics

A brief description of the classifiers and metrics used is included above. The intent was not necessarily to choose the best classifier and metric, but to choose all that might be applicable, to see how they behave on this data and with the anomaly detection used.

Summary of Analyses and Results

Pre-Anomaly Removal

The Gaussian Process took a long time to train, but it also turned out to be the best classifier. I could have reduced the maximum iterations to save time, but that might have made it a worse classifier. I would not have thought that the Random Forest would do so poorly relative to the Decision Tree, which ended up being one of the best classifiers. It makes sense that the Decision Tree did so well, as the original research on this data set used Bayesian rule sets, which are quite similar. The differences between the PCA-transformed data and the regular data were also quite interesting: on the PCA-transformed data, the RBF SVM was the top performer, followed by the Neural Net. The Gaussian Process did worse there, and it took less training time; perhaps if it had trained for as long as it did on the regular data, it would have done better.

Comparing the metrics between the PCA-transformed data and the regular data, the Neural Net did just as well, and the RBF SVM bested the Gaussian Process on quite a few metrics. Nearly every other classifier except QDA did worse with the PCA-transformed data.
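For reference, a sketch of how a PCA-transformed copy of the data can be produced. The number of components here is illustrative only; the setting actually used in this project is not recorded above:

    from sklearn.decomposition import PCA

    pca = PCA(n_components=2)                      # illustrative, not the tuned value
    X_pca = pca.fit_transform(X)
    print(pca.explained_variance_ratio_.sum())     # the preserved variance discussed in the conclusions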

Anomaly Removal

Local Outlier Factor (LOF) was used to detect anomalies. A k-nearest-neighbors approach might have found anomalies better, but I was concerned that it would be biased toward one class or the other. I decided to remove 10% of the data, since the best accuracy score was 0.7 and the goal was to see how much removing anomalies helps classification. I computed the anomalies on the regular data and on the PCA-transformed data and removed them from each respectively. I also removed the anomalies detected on the PCA-transformed data from a copy of the regular data, on the theory that distances might be measured better in the reduced dimensionality. Looking at the graph, there is a noticeable though slight difference.
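A minimal sketch of this step, where contamination=0.1 corresponds to removing 10% of the data as described above:

    from sklearn.neighbors import LocalOutlierFactor

    lof = LocalOutlierFactor(contamination=0.1)
    labels = lof.fit_predict(X)        # -1 marks an anomaly, 1 marks an inlier
    X_clean, y_clean = X[labels == 1], y[labels == 1]

    # Repeat on the PCA-transformed data; the same mask can also be applied to a
    # copy of the regular data, as described above.
    labels_pca = lof.fit_predict(X_pca)
    X_pca_clean, y_pca_clean = X_pca[labels_pca == 1], y[labels_pca == 1]
    X_regular_pca_mask = X[labels_pca == 1]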

Post-Anomaly Removal

For the anomaly-removed data, the Neural Net proved the best on the data with the regularly detected anomalies removed. Removing the PCA-detected anomalies from the regular data helped every classifier more than the regular anomaly detection did, except for the Neural Net and QDA. The RBF SVM proved to be the best classifier overall on the PCA-transformed, anomaly-removed data.

Removing the PCA-detected anomalies helped every classifier except the Decision Tree and QDA. Removing the regularly detected anomalies helped the Neural Net the most, which saw the largest increase in accuracy, and actually hurt most of the other classifiers. Removing the anomalies from the PCA-transformed data also helped, or at least did not hurt, the accuracy of every classifier except the Decision Tree.

Conclusions

I learned a lot about each of the classifiers and metrics in my analyses. I deepened my understanding of the classifiers and metrics covered in class and how they work together, and I learned about many classifiers I had never heard of before. It was also interesting to see how anomaly detection affected classification and how it could be used in this setting.

I was really surprised that the RBF SVM did the best overall on the PCA'd data. Despite the lower amount of preserved variance, it achieved a better classification than any other method. Naive Bayes performed well, but I thought it would have performed even better than it did; then again, many of the attributes are not independent, and many were not evenly distributed. It was interesting to see how much anomaly removal helped some classifiers over others, how it even hurt some, and how it helped some metrics while not helping others.

The visualizations and comparisons helped me understand the methods and problems much better than I did before. I was exposed to a lot of new material that will prove useful in the future; if I'm ever building a coupon recommendation system, I will know where to go.

If I had more time, there are a lot of parameters to change. I would try to encode the data more uniformly and try one-hot encoding too. Every classifier had many parameters to tune. I would check whether reducing the maximum iterations on the Gaussian Process would decrease accuracy very much; it would be nice if it didn't. Many of the metrics are more informative with confidence levels of predictions, which some of the classifiers can provide, and it would be interesting to see how to optimize those metrics. The RBF SVM result was quite unexpected and could be further explored. More anomaly detection techniques could be tried, such as a k-nearest-neighbors approach, and more or fewer anomalies could be removed to see how that affects classification. It would also be very intriguing to see which attributes anomaly detection removed the most, or which ones affect classification the most. It is also possible that some attributes hurt classification rather than helped it.