205 lines
5.9 KiB
Python
205 lines
5.9 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
# -*-coding:utf-8 -*-
|
|||
|
from matplotlib.font_manager import FontProperties
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
import numpy as np
|
|||
|
|
|||
|
def loadDataSet(fileName):
|
|||
|
"""
|
|||
|
函数说明:加载数据
|
|||
|
Parameters:
|
|||
|
fileName - 文件名
|
|||
|
Returns:
|
|||
|
xArr - x数据集
|
|||
|
yArr - y数据集
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-11-20
|
|||
|
"""
|
|||
|
numFeat = len(open(fileName).readline().split('\t')) - 1
|
|||
|
xArr = []; yArr = []
|
|||
|
fr = open(fileName)
|
|||
|
for line in fr.readlines():
|
|||
|
lineArr =[]
|
|||
|
curLine = line.strip().split('\t')
|
|||
|
for i in range(numFeat):
|
|||
|
lineArr.append(float(curLine[i]))
|
|||
|
xArr.append(lineArr)
|
|||
|
yArr.append(float(curLine[-1]))
|
|||
|
return xArr, yArr
|
|||
|
|
|||
|
def ridgeRegres(xMat, yMat, lam = 0.2):
|
|||
|
"""
|
|||
|
函数说明:岭回归
|
|||
|
Parameters:
|
|||
|
xMat - x数据集
|
|||
|
yMat - y数据集
|
|||
|
lam - 缩减系数
|
|||
|
Returns:
|
|||
|
ws - 回归系数
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-11-20
|
|||
|
"""
|
|||
|
xTx = xMat.T * xMat
|
|||
|
denom = xTx + np.eye(np.shape(xMat)[1]) * lam
|
|||
|
if np.linalg.det(denom) == 0.0:
|
|||
|
print("矩阵为奇异矩阵,不能求逆")
|
|||
|
return
|
|||
|
ws = denom.I * (xMat.T * yMat)
|
|||
|
return ws
|
|||
|
|
|||
|
def ridgeTest(xArr, yArr):
|
|||
|
"""
|
|||
|
函数说明:岭回归测试
|
|||
|
Parameters:
|
|||
|
xMat - x数据集
|
|||
|
yMat - y数据集
|
|||
|
Returns:
|
|||
|
wMat - 回归系数矩阵
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-11-20
|
|||
|
"""
|
|||
|
xMat = np.mat(xArr); yMat = np.mat(yArr).T
|
|||
|
#数据标准化
|
|||
|
yMean = np.mean(yMat, axis = 0) #行与行操作,求均值
|
|||
|
yMat = yMat - yMean #数据减去均值
|
|||
|
xMeans = np.mean(xMat, axis = 0) #行与行操作,求均值
|
|||
|
xVar = np.var(xMat, axis = 0) #行与行操作,求方差
|
|||
|
xMat = (xMat - xMeans) / xVar #数据减去均值除以方差实现标准化
|
|||
|
numTestPts = 30 #30个不同的lambda测试
|
|||
|
wMat = np.zeros((numTestPts, np.shape(xMat)[1])) #初始回归系数矩阵
|
|||
|
for i in range(numTestPts): #改变lambda计算回归系数
|
|||
|
ws = ridgeRegres(xMat, yMat, np.exp(i - 10)) #lambda以e的指数变化,最初是一个非常小的数,
|
|||
|
wMat[i, :] = ws.T #计算回归系数矩阵
|
|||
|
return wMat
|
|||
|
|
|||
|
def plotwMat():
|
|||
|
"""
|
|||
|
函数说明:绘制岭回归系数矩阵
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-11-20
|
|||
|
"""
|
|||
|
#font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
|
|||
|
abX, abY = loadDataSet('abalone.txt')
|
|||
|
redgeWeights = ridgeTest(abX, abY)
|
|||
|
fig = plt.figure()
|
|||
|
ax = fig.add_subplot(111)
|
|||
|
ax.plot(redgeWeights)
|
|||
|
ax_title_text = ax.set_title(u'log(lambada) and regession')
|
|||
|
ax_xlabel_text = ax.set_xlabel(u'log(lambada)')
|
|||
|
ax_ylabel_text = ax.set_ylabel(u'regression')
|
|||
|
plt.setp(ax_title_text, size = 20, weight = 'bold', color = 'red')
|
|||
|
plt.setp(ax_xlabel_text, size = 10, weight = 'bold', color = 'black')
|
|||
|
plt.setp(ax_ylabel_text, size = 10, weight = 'bold', color = 'black')
|
|||
|
plt.show()
|
|||
|
|
|||
|
|
|||
|
def regularize(xMat, yMat):
|
|||
|
"""
|
|||
|
函数说明:数据标准化
|
|||
|
Parameters:
|
|||
|
xMat - x数据集
|
|||
|
yMat - y数据集
|
|||
|
Returns:
|
|||
|
inxMat - 标准化后的x数据集
|
|||
|
inyMat - 标准化后的y数据集
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-12-03
|
|||
|
"""
|
|||
|
inxMat = xMat.copy() #数据拷贝
|
|||
|
inyMat = yMat.copy()
|
|||
|
yMean = np.mean(yMat, 0) #行与行操作,求均值
|
|||
|
inyMat = yMat - yMean #数据减去均值
|
|||
|
inMeans = np.mean(inxMat, 0) #行与行操作,求均值
|
|||
|
inVar = np.var(inxMat, 0) #行与行操作,求方差
|
|||
|
inxMat = (inxMat - inMeans) / inVar #数据减去均值除以方差实现标准化
|
|||
|
return inxMat, inyMat
|
|||
|
|
|||
|
def rssError(yArr,yHatArr):
|
|||
|
"""
|
|||
|
函数说明:计算平方误差
|
|||
|
Parameters:
|
|||
|
yArr - 预测值
|
|||
|
yHatArr - 真实值
|
|||
|
Returns:
|
|||
|
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-12-03
|
|||
|
"""
|
|||
|
return ((yArr-yHatArr)**2).sum()
|
|||
|
|
|||
|
def stageWise(xArr, yArr, eps = 0.01, numIt = 100):
|
|||
|
"""
|
|||
|
函数说明:前向逐步线性回归
|
|||
|
Parameters:
|
|||
|
xArr - x输入数据
|
|||
|
yArr - y预测数据
|
|||
|
eps - 每次迭代需要调整的步长
|
|||
|
numIt - 迭代次数
|
|||
|
Returns:
|
|||
|
returnMat - numIt次迭代的回归系数矩阵
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-12-03
|
|||
|
"""
|
|||
|
xMat = np.mat(xArr); yMat = np.mat(yArr).T #数据集
|
|||
|
xMat, yMat = regularize(xMat, yMat) #数据标准化
|
|||
|
m, n = np.shape(xMat)
|
|||
|
returnMat = np.zeros((numIt, n)) #初始化numIt次迭代的回归系数矩阵
|
|||
|
ws = np.zeros((n, 1)) #初始化回归系数矩阵
|
|||
|
wsTest = ws.copy()
|
|||
|
wsMax = ws.copy()
|
|||
|
for i in range(numIt): #迭代numIt次
|
|||
|
# print(ws.T) #打印当前回归系数矩阵
|
|||
|
lowestError = float('inf'); #正无穷
|
|||
|
for j in range(n): #遍历每个特征的回归系数
|
|||
|
for sign in [-1, 1]:
|
|||
|
wsTest = ws.copy()
|
|||
|
wsTest[j] += eps * sign #微调回归系数
|
|||
|
yTest = xMat * wsTest #计算预测值
|
|||
|
rssE = rssError(yMat.A, yTest.A) #计算平方误差
|
|||
|
if rssE < lowestError: #如果误差更小,则更新当前的最佳回归系数
|
|||
|
lowestError = rssE
|
|||
|
wsMax = wsTest
|
|||
|
ws = wsMax.copy()
|
|||
|
returnMat[i,:] = ws.T #记录numIt次迭代的回归系数矩阵
|
|||
|
return returnMat
|
|||
|
|
|||
|
def plotstageWiseMat():
|
|||
|
"""
|
|||
|
函数说明:绘制岭回归系数矩阵
|
|||
|
Website:
|
|||
|
http://www.cuijiahua.com/
|
|||
|
Modify:
|
|||
|
2017-12-03
|
|||
|
"""
|
|||
|
#font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
|
|||
|
xArr, yArr = loadDataSet('abalone.txt')
|
|||
|
returnMat = stageWise(xArr, yArr, 0.005, 1000)
|
|||
|
fig = plt.figure()
|
|||
|
ax = fig.add_subplot(111)
|
|||
|
ax.plot(returnMat)
|
|||
|
ax_title_text = ax.set_title(u'front regression')
|
|||
|
ax_xlabel_text = ax.set_xlabel(u'regression')
|
|||
|
ax_ylabel_text = ax.set_ylabel(u'regression coefficient')
|
|||
|
plt.setp(ax_title_text, size = 15, weight = 'bold', color = 'red')
|
|||
|
plt.setp(ax_xlabel_text, size = 10, weight = 'bold', color = 'black')
|
|||
|
plt.setp(ax_ylabel_text, size = 10, weight = 'bold', color = 'black')
|
|||
|
plt.show()
|
|||
|
|
|||
|
|
|||
|
if __name__ == '__main__':
|
|||
|
plotstageWiseMat()
|