数据集说明:
Iris数据集也称鸢尾花卉数据集,是常用的分类实验数据集,是一类多重变量分析的数据集。数据集包含150个样本,分为3类,每类50个数据。每个数据包含4个属性:花萼长度,花萼宽度,花瓣长度,花瓣宽度。依据4个属性预测鸢尾花卉属于(Setosa,Versicolour,Virginica)三个种类中的哪一类。<br/>
程序说明:采用三种不同的机器学习算法(sklearn KNN、Decision Tree、自实现的KNN)由Python语言实现鸢尾花卉的分类。<br/>
算法理论请参照:KNN算法、决策树算法<br/>
Ipynb演示文件:Ipynb文件<br/>
Python代码:Python代码<br/>

1
2
3
#This data sets consists of 3 different types of irises’ (Setosa, Versicolour, and Virginica) 
#petal and sepal length, stored in a 150x4 numpy.ndarray
#The rows being the samples and the columns being: Sepal Length, Sepal Width, Petal Length and Petal Width.
1
2
3
4
5
6
7
8
9
10
11
12
from sklearn import datasets

# Load the Iris data: 150 samples x 4 features, 3 target classes.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

# Dataset size (bare expression: shows 150 when run as a notebook cell).
len(X)

# Randomly split the data: 70% training set, 30% test set.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#rocky实现的KNN算法

from sklearn import metrics
from collections import Counter
from scipy.spatial import distance

class KNN:
    """A minimal k-nearest-neighbors classifier mirroring the sklearn
    estimator API (fit / predict / score).

    KNN is a lazy learner: fit() only memorizes the training data, and
    all work happens at prediction time.
    """

    def __init__(self):
        # Populated by fit(); None until then.
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        """Memorize the training samples and their labels."""
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, x_test, k=3):
        """Return a list with one predicted label per row of x_test.

        By default uses the single nearest neighbor; switch to the
        commented line to use majority voting over the k nearest
        neighbors instead.
        """
        predictions = []
        for element in x_test:
            # Nearest-neighbor only (k is ignored in this mode).
            label = self.closet(element)
            # Majority vote over the k nearest neighbors:
            # label = self.vote(element, k)
            predictions.append(label)
        return predictions

    def score(self, x_test, y_test):
        """Return classification accuracy on (x_test, y_test).

        Computed directly so the class no longer needs sklearn.metrics.
        """
        predictions = self.predict(x_test)
        correct = sum(1 for p, t in zip(predictions, y_test) if p == t)
        return float(correct) / len(y_test)

    def closet(self, element):
        """Return the label of the training sample nearest to element.

        (Name kept for backward compatibility -- sic for "closest".)
        """
        best_dist = distance.euclidean(element, self.x_train[0])
        best_index = 0
        for index in range(1, len(self.x_train)):
            dist = distance.euclidean(element, self.x_train[index])
            if dist < best_dist:
                best_dist = dist
                best_index = index
        return self.y_train[best_index]

    def vote(self, element, k):
        """Return the majority label among the k nearest training samples.

        Fixes two bugs in the previous version: the replacement scan did
        not stop after inserting a closer neighbor, so one sample could
        evict several k-list entries; and the final Counter tallied
        training-set *indices* (which are all unique) instead of labels,
        making the vote meaningless.
        """
        dists = [distance.euclidean(element, x) for x in self.x_train]
        # Indices of the k smallest distances.
        nearest = sorted(range(len(dists)), key=lambda i: dists[i])[:k]
        labels = [self.y_train[i] for i in nearest]
        # most_common(1) -> [(label, count)]; take the label.
        return Counter(labels).most_common(1)[0][0]

1
2
3
4
5
6
7
8
9
# Pick the algorithms to compare: sklearn's KNN, the hand-written KNN
# above, and a decision tree. Keys double as serialization file names.
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

cls_dict = {
    'Sklearn-KNN': KNeighborsClassifier(),
    'Rocky-KNN': KNN(),
    'DecisionTree': DecisionTreeClassifier(),
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Train and evaluate each algorithm. Trained models are pickled so reruns
# skip training; delete a model's .pickle file to force retraining (e.g.
# after tuning). Rewritten in Python-2/3-compatible syntax: the original
# used `except Exception, e` and statement-form `print`, which do not
# parse under Python 3.
import pickle

for name, cls in cls_dict.items():
    model_file = '%s.pickle' % name
    try:
        # Reuse a previously trained model if one was serialized.
        # NOTE(security): pickle.load runs arbitrary code -- only load
        # files this script itself produced.
        with open(model_file, 'rb') as f:
            cls = pickle.load(f)
    except Exception as e:
        # No usable cached model: report why, train from scratch, and
        # cache the result (previously the model was re-pickled on every
        # run even when it had just been loaded from disk).
        print(e)
        cls.fit(X_train, Y_train)
        with open(model_file, 'wb') as f:
            pickle.dump(cls, f)

    # Evaluate on the held-out test set.
    print("%s Algorithm Accuracy: %s" % (name, cls.score(X_test, Y_test)))