271 lines
7.4 KiB
Python
271 lines
7.4 KiB
Python
|
# -*- coding:UTF-8 -*-
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
import numpy as np
|
|||
|
import random
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:读取数据
|
|||
|
|
|||
|
Parameters:
|
|||
|
fileName - 文件名
|
|||
|
Returns:
|
|||
|
dataMat - 数据矩阵
|
|||
|
labelMat - 数据标签
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-21
|
|||
|
"""
|
|||
|
def loadDataSet(fileName):
|
|||
|
dataMat = []; labelMat = []
|
|||
|
fr = open(fileName)
|
|||
|
for line in fr.readlines(): #逐行读取,滤除空格等
|
|||
|
lineArr = line.strip().split('\t')
|
|||
|
dataMat.append([float(lineArr[0]), float(lineArr[1])]) #添加数据
|
|||
|
labelMat.append(float(lineArr[2])) #添加标签
|
|||
|
return dataMat,labelMat
|
|||
|
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:随机选择alpha
|
|||
|
|
|||
|
Parameters:
|
|||
|
i - alpha_i的索引值
|
|||
|
m - alpha参数个数
|
|||
|
Returns:
|
|||
|
j - alpha_j的索引值
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-21
|
|||
|
"""
|
|||
|
def selectJrand(i, m):
|
|||
|
j = i #选择一个不等于i的j
|
|||
|
while (j == i):
|
|||
|
j = int(random.uniform(0, m))
|
|||
|
return j
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:修剪alpha
|
|||
|
|
|||
|
Parameters:
|
|||
|
aj - alpha_j值
|
|||
|
H - alpha上限
|
|||
|
L - alpha下限
|
|||
|
Returns:
|
|||
|
aj - alpah值
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-21
|
|||
|
"""
|
|||
|
def clipAlpha(aj,H,L):
|
|||
|
if aj > H:
|
|||
|
aj = H
|
|||
|
if L > aj:
|
|||
|
aj = L
|
|||
|
return aj
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:数据可视化
|
|||
|
|
|||
|
Parameters:
|
|||
|
dataMat - 数据矩阵
|
|||
|
labelMat - 数据标签
|
|||
|
Returns:
|
|||
|
无
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-21
|
|||
|
"""
|
|||
|
def showDataSet(dataMat, labelMat):
|
|||
|
data_plus = [] #正样本
|
|||
|
data_minus = [] #负样本
|
|||
|
for i in range(len(dataMat)):
|
|||
|
if labelMat[i] > 0:
|
|||
|
data_plus.append(dataMat[i])
|
|||
|
else:
|
|||
|
data_minus.append(dataMat[i])
|
|||
|
data_plus_np = np.array(data_plus) #转换为numpy矩阵
|
|||
|
data_minus_np = np.array(data_minus) #转换为numpy矩阵
|
|||
|
plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图
|
|||
|
plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图
|
|||
|
plt.show()
|
|||
|
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:简化版SMO算法
|
|||
|
|
|||
|
Parameters:
|
|||
|
dataMatIn - 数据矩阵
|
|||
|
classLabels - 数据标签
|
|||
|
C - 松弛变量
|
|||
|
toler - 容错率
|
|||
|
maxIter - 最大迭代次数
|
|||
|
Returns:
|
|||
|
无
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-23
|
|||
|
"""
|
|||
|
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
|
|||
|
#转换为numpy的mat存储
|
|||
|
dataMatrix = np.mat(dataMatIn); labelMat = np.mat(classLabels).transpose()
|
|||
|
#初始化b参数,统计dataMatrix的维度
|
|||
|
b = 0; m,n = np.shape(dataMatrix)
|
|||
|
#初始化alpha参数,设为0
|
|||
|
alphas = np.mat(np.zeros((m,1)))
|
|||
|
#初始化迭代次数
|
|||
|
iter_num = 0
|
|||
|
#最多迭代matIter次
|
|||
|
while (iter_num < maxIter):
|
|||
|
alphaPairsChanged = 0
|
|||
|
for i in range(m):
|
|||
|
#步骤1:计算误差Ei
|
|||
|
fXi = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
|
|||
|
Ei = fXi - float(labelMat[i])
|
|||
|
#优化alpha,设定一定的容错率。
|
|||
|
if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
|
|||
|
#随机选择另一个与alpha_i成对优化的alpha_j
|
|||
|
j = selectJrand(i,m)
|
|||
|
#步骤1:计算误差Ej
|
|||
|
fXj = float(np.multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
|
|||
|
Ej = fXj - float(labelMat[j])
|
|||
|
#保存更新前的aplpha值,使用深拷贝
|
|||
|
alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy();
|
|||
|
#步骤2:计算上下界L和H
|
|||
|
if (labelMat[i] != labelMat[j]):
|
|||
|
L = max(0, alphas[j] - alphas[i])
|
|||
|
H = min(C, C + alphas[j] - alphas[i])
|
|||
|
else:
|
|||
|
L = max(0, alphas[j] + alphas[i] - C)
|
|||
|
H = min(C, alphas[j] + alphas[i])
|
|||
|
if L==H: print("L==H"); continue
|
|||
|
#步骤3:计算eta
|
|||
|
eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
|
|||
|
if eta >= 0: print("eta>=0"); continue
|
|||
|
#步骤4:更新alpha_j
|
|||
|
alphas[j] -= labelMat[j]*(Ei - Ej)/eta
|
|||
|
#步骤5:修剪alpha_j
|
|||
|
alphas[j] = clipAlpha(alphas[j],H,L)
|
|||
|
if (abs(alphas[j] - alphaJold) < 0.00001): print("alpha_j变化太小"); continue
|
|||
|
#步骤6:更新alpha_i
|
|||
|
alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])
|
|||
|
#步骤7:更新b_1和b_2
|
|||
|
b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
|
|||
|
b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
|
|||
|
#步骤8:根据b_1和b_2更新b
|
|||
|
if (0 < alphas[i]) and (C > alphas[i]): b = b1
|
|||
|
elif (0 < alphas[j]) and (C > alphas[j]): b = b2
|
|||
|
else: b = (b1 + b2)/2.0
|
|||
|
#统计优化次数
|
|||
|
alphaPairsChanged += 1
|
|||
|
#打印统计信息
|
|||
|
print("第%d次迭代 样本:%d, alpha优化次数:%d" % (iter_num,i,alphaPairsChanged))
|
|||
|
#更新迭代次数
|
|||
|
if (alphaPairsChanged == 0): iter_num += 1
|
|||
|
else: iter_num = 0
|
|||
|
print("迭代次数: %d" % iter_num)
|
|||
|
return b,alphas
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:分类结果可视化
|
|||
|
|
|||
|
Parameters:
|
|||
|
dataMat - 数据矩阵
|
|||
|
w - 直线法向量
|
|||
|
b - 直线解决
|
|||
|
Returns:
|
|||
|
无
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-23
|
|||
|
"""
|
|||
|
def showClassifer(dataMat, w, b):
|
|||
|
#绘制样本点
|
|||
|
data_plus = [] #正样本
|
|||
|
data_minus = [] #负样本
|
|||
|
for i in range(len(dataMat)):
|
|||
|
if labelMat[i] > 0:
|
|||
|
data_plus.append(dataMat[i])
|
|||
|
else:
|
|||
|
data_minus.append(dataMat[i])
|
|||
|
data_plus_np = np.array(data_plus) #转换为numpy矩阵
|
|||
|
data_minus_np = np.array(data_minus) #转换为numpy矩阵
|
|||
|
plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1], s=30, alpha=0.7) #正样本散点图
|
|||
|
plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1], s=30, alpha=0.7) #负样本散点图
|
|||
|
#绘制直线
|
|||
|
x1 = max(dataMat)[0]
|
|||
|
x2 = min(dataMat)[0]
|
|||
|
a1, a2 = w
|
|||
|
b = float(b)
|
|||
|
a1 = float(a1[0])
|
|||
|
a2 = float(a2[0])
|
|||
|
y1, y2 = (-b- a1*x1)/a2, (-b - a1*x2)/a2
|
|||
|
plt.plot([x1, x2], [y1, y2])
|
|||
|
#找出支持向量点
|
|||
|
for i, alpha in enumerate(alphas):
|
|||
|
if abs(alpha) > 0:
|
|||
|
x, y = dataMat[i]
|
|||
|
plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red')
|
|||
|
plt.show()
|
|||
|
|
|||
|
|
|||
|
"""
|
|||
|
函数说明:计算w
|
|||
|
|
|||
|
Parameters:
|
|||
|
dataMat - 数据矩阵
|
|||
|
labelMat - 数据标签
|
|||
|
alphas - alphas值
|
|||
|
Returns:
|
|||
|
无
|
|||
|
Author:
|
|||
|
Jack Cui
|
|||
|
Blog:
|
|||
|
http://blog.csdn.net/c406495762
|
|||
|
Zhihu:
|
|||
|
https://www.zhihu.com/people/Jack--Cui/
|
|||
|
Modify:
|
|||
|
2017-09-23
|
|||
|
"""
|
|||
|
def get_w(dataMat, labelMat, alphas):
|
|||
|
alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
|
|||
|
w = np.dot((np.tile(labelMat.reshape(1, -1).T, (1, 2)) * dataMat).T, alphas)
|
|||
|
return w.tolist()
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
|
|||
|
dataMat, labelMat = loadDataSet('testSet.txt')
|
|||
|
b,alphas = smoSimple(dataMat, labelMat, 0.6, 0.001, 40)
|
|||
|
w = get_w(dataMat, labelMat, alphas)
|
|||
|
showClassifer(dataMat, w, b)
|
|||
|
|