The Iris dataset was created by R.A. Fisher and is perhaps the best-known dataset in the pattern recognition literature. Fisher's paper is a classic in the field and is still referenced frequently today. The dataset contains 3 classes of 50 instances each, where each class refers to a type of iris plant; the attribute to predict is the class of iris plant.

This article classifies the dataset using three models: K-Nearest Neighbors (KNN), Naive Bayes, and Logistic Regression.
You can find the data source and Iris Dataset Analytics.py here.
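
If the CSV is not available, a minimal alternative sketch loads the same data from scikit-learn's built-in copy (assuming scikit-learn >= 0.23 for the as_frame option; its column names differ slightly from the CSV used below):

from sklearn.datasets import load_iris

# Built-in copy of the data: 150 rows, 4 features, 3 classes of 50 instances each
iris = load_iris(as_frame=True)
print(iris.frame.shape)     # (150, 5) -> four measurements plus the target column
print(iris.target_names)    # ['setosa' 'versicolor' 'virginica']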

Exploring Data

import pandas as pd
import numpy as np

df = pd.read_csv(
    "/Users/qingqiuzhang/Desktop/iris_dataset.csv",
    header = None,
    names  = ["sepal length", "sepal width", "petal length", "petal width", "class"],
)
df.head()
sepal length sepal width petal length petal width class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
Compute summary statistics for the four features.
df.describe()
sepal length sepal width petal length petal width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
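
The overall statistics blur the class structure; an optional extra check (not in the original walkthrough) is the per-class mean of each feature:

df.groupby("class").mean(numeric_only=True)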

Visualization

import matplotlib.pyplot as plt
import seaborn as sns

features = ["sepal length", "sepal width", "petal length", "petal width"]
sns.set(style="ticks", palette="pastel")
f, axes = plt.subplots(2, 2, sharey=False, figsize=(14, 14))
for ind, val in enumerate(features):
    # One violin plot per feature, split by class; title each panel with its own feature name
    sns.violinplot(x="class", y=val, data=df, ax=axes[ind // 2, ind % 2]).set(
        title = val.title()
    )

plt.show()
sns.pairplot(df, hue="class")
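
The pair plot already hints that the petal measurements separate the classes best; as an optional extra (not in the original article), a correlation heatmap summarizes the pairwise feature relationships numerically:

plt.figure(figsize=(6, 5))
sns.heatmap(df[features].corr(), annot=True, cmap="Blues")  # pairwise Pearson correlations
plt.show()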

Train and evaluate three models using cross-validation
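
All three searches below share the same evaluation protocol: a ShuffleSplit with 200 random 75/25 train/test splits, scored by accuracy. As a standalone illustration (not part of the original code), here is that protocol applied to one fixed model:

from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Score one fixed pipeline over 200 random 75/25 splits, exactly the CV scheme reused below
cv     = ShuffleSplit(n_splits=200, test_size=0.25, random_state=10)
model  = Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=5))])
scores = cross_val_score(model, df[features], df["class"], cv=cv, scoring="accuracy")
print(scores.mean(), scores.std())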

K-Nearest Neighbors Classifier

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline

NUM         = 200
X           = df.drop(["class"], axis=1)
y           = df["class"]
shuffle     = ShuffleSplit(n_splits=NUM, test_size=0.25, random_state=10)  # 200 random 75/25 splits
klist       = np.arange(1, 21, 1)  # candidate values of K: 1..20
FullModel   = Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
param_grid  = {"knn__n_neighbors": klist}
grid_search = GridSearchCV(
    FullModel,
    param_grid,
    scoring            = "accuracy",
    cv                 = shuffle,
    return_train_score = True,
    n_jobs             = -1,
)
grid_search.fit(X, y)
results = pd.DataFrame(grid_search.cv_results_)

print(
    results[
        [
            "rank_test_score",
            "mean_train_score",
            "mean_test_score",
            "param_knn__n_neighbors",
        ]
    ]
)
rank_test_score mean_train_score mean_test_score param_knn__n_neighbors
0 18 1.000000 0.943947 1
1 20 0.970268 0.937368 2
2 17 0.959018 0.945132 3
3 15 0.958214 0.945789 4
4 12 0.963304 0.949737 5
5 9 0.963080 0.952105 6
6 6 0.966741 0.954605 7
7 3 0.962500 0.955395 8
8 7 0.963482 0.954474 9
9 5 0.963571 0.955000 10
10 2 0.965536 0.956447 11
11 1 0.965982 0.958553 12
12 4 0.963973 0.955132 13
13 8 0.963661 0.954079 14
14 11 0.961205 0.950000 15
15 10 0.959107 0.950658 16
16 14 0.957143 0.946711 17
17 13 0.956205 0.947895 18
18 16 0.955223 0.945526 19
19 19 0.953036 0.942895 20
# Plot accuracy vs. K (x-axis reversed so model complexity increases from left to right)
fig, ax = plt.subplots()
ax.plot(
    results["param_knn__n_neighbors"], results["mean_test_score"], label="test accuracy"
)
ax.set_xlim(20, 0)  # large K (simple model) on the left, small K (complex model) on the right
ax.set_ylabel("Accuracy")
ax.set_xlabel("n_neighbors")
ax.grid()
ax.legend()
plt.show()
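
The table and plot point to K = 12 for this run; the fitted grid search also exposes the winner directly (grid_search here is still the KNN search, before it is reassigned below):

print(grid_search.best_params_)   # {'knn__n_neighbors': 12} on this run
print(grid_search.best_score_)    # mean test accuracy of the best K (about 0.959 here)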

Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

features = ["sepal length", "sepal width", "petal length", "petal width"]

# Discretize each feature: shift to start at zero, round to an integer bin,
# treat the bins as categories, then one-hot encode them for MultinomialNB.
df1 = df.copy()
for column in features:
    df1[column] = [round(i - np.min(df1[column])) for i in df1[column]]
    df1[column] = df1[column].astype("category")
X = pd.get_dummies(df1[features])
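
An optional sanity check (not in the original code) confirms what the encoding produced: one dummy column per (feature, bin) pair:

print(df1[features].nunique())   # number of integer bins per feature
print(X.shape)                   # (150, total number of dummy columns)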

# Calculate mean test accuracy
alphas      = [0.01, 0.1, 1.0, 5.0, 10.0, 15.0, 20.0, 35.0, 50.0]
FullModel   = Pipeline([("scaler", MinMaxScaler()), ("mnb", MultinomialNB())])
param_grid  = {"mnb__alpha": alphas}
grid_search = GridSearchCV(
    FullModel,
    param_grid,
    scoring            = "accuracy",
    cv                 = shuffle,
    return_train_score = True,
    n_jobs             = -1,
)
grid_search.fit(X, y)
results = pd.DataFrame(grid_search.cv_results_)
print(
    results[
        ["rank_test_score", "mean_train_score", "mean_test_score", "param_mnb__alpha"]
    ]
)
rank_test_score mean_train_score mean_test_score param_mnb__alpha
0 9 0.942054 0.931711 0.01
1 8 0.942187 0.932105 0.1
2 7 0.944420 0.935789 1.0
3 5 0.947321 0.942237 5.0
4 3 0.946830 0.945132 10.0
5 1 0.946786 0.946184 15.0
6 2 0.946741 0.946053 20.0
7 4 0.945312 0.944211 35.0
8 6 0.943080 0.941053 50.0

Logistic Regression

from sklearn.linear_model import LogisticRegression

df2 = df.copy()
X   = df2.iloc[:, 0:4].values
y   = df2.iloc[:, 4].values

# Calculate mean test accuracy
Clist     = [0.01, 0.1, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 150.0, 200.0]
FullModel = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(penalty="l2", solver="lbfgs", multi_class="auto")),
    ]
)
param_grid  = {"lr__C": Clist}
grid_search = GridSearchCV(
    FullModel,
    param_grid,
    scoring            = "accuracy",
    cv                 = shuffle,
    return_train_score = True,
    n_jobs             = -1,
)
grid_search.fit(X, y)
results = pd.DataFrame(grid_search.cv_results_)
print(
    results[["rank_test_score", "mean_train_score", "mean_test_score", "param_lr__C"]]
)
rank_test_score mean_train_score mean_test_score param_lr__C
0 10 0.852321 0.834342 0.01
1 9 0.919777 0.908684 0.1
2 8 0.969509 0.960132 1.0
3 7 0.979420 0.968026 5.0
4 4 0.981295 0.969342 10.0
5 3 0.982857 0.969737 20.0
6 1 0.984598 0.970263 50.0
7 2 0.986205 0.969737 100.0
8 5 0.986786 0.969211 150.0
9 6 0.987009 0.968684 200.0
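
As with KNN, the best regularization strength can be read off the fitted search, and the refit pipeline exposes the per-class coefficients (grid_search now refers to the logistic-regression search):

print(grid_search.best_params_)                        # {'lr__C': 50.0} on this run
best_lr = grid_search.best_estimator_.named_steps["lr"]
print(best_lr.coef_.shape)                             # (3 classes, 4 features)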

Conclusion

For this run, Logistic Regression has the highest mean test accuracy (about 0.970, at C = 50), followed by the KNN classifier (about 0.959, at K = 12); the Naive Bayes classifier has the lowest (about 0.946, at alpha = 15).
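
If you want the comparison in one place, a small sketch is shown below; it assumes each fitted GridSearchCV was kept under its own name (hypothetical names knn_search, nb_search, and lr_search), whereas the code above reuses the single name grid_search:

# Hypothetical: requires keeping the three fitted searches under separate names
best = pd.Series(
    {
        "KNN":                 knn_search.best_score_,
        "Naive Bayes":         nb_search.best_score_,
        "Logistic Regression": lr_search.best_score_,
    },
    name="mean test accuracy",
)
print(best.sort_values(ascending=False))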