# coding=utf-8
import sys

import matplotlib.pyplot as plt
from numpy import *
# Load the data set: each row of the file has three whitespace-separated
# columns -- the first two are the inputs, the third is the output label.
def loadDataSet(fileName='testSet.txt'):
    """Read a two-feature, one-label data file.

    Args:
        fileName: Path of the text file to read. Defaults to 'testSet.txt',
            the file the original implementation always opened.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the leading 1.0 is the bias input) and
        ``labelMat`` is the list of integer labels, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError, IndexError: If a non-blank row is malformed.
    """
    dataMat = []
    labelMat = []
    # The context manager guarantees the handle is closed even when a row
    # fails to parse.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
# Logistic (sigmoid) function
def sigmoid(inX):
    """Return the logistic function ``1 / (1 + e**-inX)``.

    Args:
        inX: A scalar, NumPy array or NumPy matrix; arrays and matrices are
            processed element-wise.

    Returns:
        The logistic value(s) of ``inX``.
    """
    # For very negative inputs exp(-inX) overflows to inf and the quotient
    # saturates to 0.0, which is the correct limit. Only the overflow
    # warning is suppressed; the computed values are unchanged.
    with errstate(over='ignore'):
        return 1.0 / (1 + exp(-inX))
# Batch gradient ascent
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Every cycle uses the whole data set for one weight update.

    Args:
        dataMatIn: 2-D sequence of samples, one row per sample.
        classLabels: Sequence with one label per sample.
        alpha: Step size of each update (default 0.001).
        maxCycles: Number of full-batch updates to run (default 500).

    Returns:
        The weights as an ``(n, 1)`` column, where ``n`` is the number of
        columns of ``dataMatIn``.
    """
    # Convert to NumPy matrices so that ``*`` below is matrix multiplication.
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()
    n = shape(dataMatrix)[1]
    weights = ones((n, 1))
    # The transpose does not change between cycles, so compute it once.
    dataMatrixT = dataMatrix.transpose()
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        weights = weights + alpha * dataMatrixT * error
    return weights
# Improved gradient ascent: each update uses a single sample
def stocGradAscent0(dataMatrix, classLabels, alpha=0.01):
    """Fit logistic-regression weights with one pass of per-sample updates.

    The samples are visited once, in order, and the weights are updated
    after each one.

    Args:
        dataMatrix: 2-D array of samples, one row per sample.
        classLabels: Sequence with one label per sample.
        alpha: Step size of each update (default 0.01).

    Returns:
        A 1-D array with one weight per column of ``dataMatrix``.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    # One weight update per sample.
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
# Stochastic gradient ascent; the random sample order avoids the periodic
# fluctuations of the in-order variant.
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Each of the ``numIter`` passes visits every sample exactly once, in a
    random order, updating the weights after each sample.

    Args:
        dataMatrix: 2-D array of samples, one row per sample.
        classLabels: Sequence with one label per sample.
        numIter: Number of passes over the data (default 150).

    Returns:
        A 1-D array with one weight per column of ``dataMatrix``.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Indices of the samples not yet used in this pass. A real list is
        # required: a Python 3 ``range`` does not support item deletion.
        dataIndex = list(range(m))
        for i in range(m):
            # The step size shrinks as training progresses but never drops
            # below 0.01.
            alpha = 4 / (1.0 + j + i) + 0.01
            # NOTE(review): ``random`` is expected to be numpy.random from
            # the file's star import -- confirm.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the random position to an unused sample index and remove
            # it, so no sample is drawn twice within a pass.
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights
# Stochastic gradient descent; note the sign conventions in the update.
def stocGradDescent(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights with stochastic gradient descent.

    The error is taken as ``prediction - label`` and subtracted from the
    weights. Each of the ``numIter`` passes visits every sample exactly
    once, in a random order.

    Args:
        dataMatrix: 2-D array of samples, one row per sample.
        classLabels: Sequence with one label per sample.
        numIter: Number of passes over the data (default 150).

    Returns:
        A 1-D array with one weight per column of ``dataMatrix``.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Indices of the samples not yet used in this pass. A real list is
        # required: a Python 3 ``range`` does not support item deletion.
        dataIndex = list(range(m))
        for i in range(m):
            # The step size shrinks as training progresses but never drops
            # below 0.01.
            alpha = 4 / (1.0 + j + i) + 0.01
            # NOTE(review): ``random`` is expected to be numpy.random from
            # the file's star import -- confirm.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the random position to an unused sample index and remove
            # it, so no sample is drawn twice within a pass.
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = h - classLabels[sampleIndex]
            weights = weights - alpha * error * dataMatrix[sampleIndex]
    return weights
# Plot the result
def plotBestFit(weights):
    """Scatter-plot the sample data set and draw the fitted decision line.

    The data set is re-read with ``loadDataSet()``. Samples labelled 1 are
    drawn as red squares, all others as green dots. The line drawn is
    ``w0 + w1*x1 + w2*x2 = 0``.

    Args:
        weights: The three fitted weights, either as a flat array or as a
            (3, 1) column such as the one ``gradAscent`` returns.
    """
    dataMat, labelMat = loadDataSet()
    # Flatten so that both a 1-D array and an (n, 1) matrix can be indexed
    # as plain scalars below.
    w = asarray(weights).ravel()

    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for row, label in zip(dataMat, labelMat):
        # Column 0 of each row is the constant bias input; columns 1 and 2
        # are the two plotted features.
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # w0 + w1*x1 + w2*x2 = 0, solved for x2 (plotted on the y axis).
    y = (-w[0] - w[1] * x) / w[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    # Without this call the figure is built but never displayed when the
    # file is run as a script.
    plt.show()
# Classification function
def classifyVector(inX, weights):
    """Classify one sample with the given weights.

    Args:
        inX: Feature vector of the sample.
        weights: Weight vector, multiplied element-wise with ``inX``.

    Returns:
        1.0 when the sigmoid of the weighted sum exceeds 0.5, else 0.0.
    """
    weightedSum = sum(inX * weights)
    return 1.0 if sigmoid(weightedSum) > 0.5 else 0.0
# Read one tab-separated horse-colic file into feature rows and labels.
def _loadColicFile(fileName, numFeatures=21):
    """Parse a tab-separated file of ``numFeatures`` features plus a label.

    Args:
        fileName: Path of the file to read.
        numFeatures: Number of leading feature columns; the label is the
            column right after them.

    Returns:
        A ``(features, labels)`` tuple: a list of float lists and a list of
        float labels, in file order. Blank lines are skipped.
    """
    features = []
    labels = []
    with open(fileName) as fr:
        for line in fr:
            if not line.strip():
                continue
            currLine = line.strip().split('\t')
            features.append([float(v) for v in currLine[:numFeatures]])
            labels.append(float(currLine[numFeatures]))
    return features, labels


# Main routine for training and testing the horse-colic classifier
def colicTest():
    """Train on the horse-colic training file and score the test file.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt', trains with
    ``stocGradAscent1`` for 500 passes, classifies every test row with
    ``classifyVector`` and prints the resulting error rate.

    Returns:
        The fraction of test rows that were misclassified.

    Raises:
        ZeroDivisionError: If the test file contains no rows.
    """
    trainingSet, trainingLabels = _loadColicFile('horseColicTraining.txt')
    testSet, testLabels = _loadColicFile('horseColicTest.txt')

    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)

    errorCount = 0
    for lineArr, label in zip(testSet, testLabels):
        # Labels are parsed as floats first, so both "1" and "1.0" style
        # labels compare correctly after truncation to int.
        if int(classifyVector(array(lineArr), trainWeights)) != int(label):
            errorCount += 1

    errorRate = float(errorCount) / len(testSet)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("The error rate of this test is: %f" % errorRate)
    return errorRate
# Run the test several times and average the results
def multiTest(numTests=10):
    """Run ``colicTest`` repeatedly and print the mean error rate.

    Args:
        numTests: How many times to run ``colicTest`` (default 10).
    """
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("After %d iterations the average error rate is: % f"
          % (numTests, errorSum / float(numTests)))
if __name__ == '__main__':
    # Demo run: fit the sample data set with stochastic gradient ascent and
    # plot the resulting decision boundary.
    #
    # Alternatives that can be swapped in here:
    #   multiTest()                                   -- horse-colic benchmark
    #   stocGradDescent(array(samples), sampleLabels) -- descent formulation
    samples, sampleLabels = loadDataSet()
    fittedWeights = stocGradAscent1(array(samples), sampleLabels)
    plotBestFit(fittedWeights)