kaggle/src/python/getting-started/digit-recognizer/svm-python3.6.py

247 lines
8.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
# coding: utf-8
'''
Created on 2017-10-26
Update on 2017-10-26
Author: 片刻
Github: https://github.com/apachecn/kaggle
'''
import os.path
import csv
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Path to the Kaggle getting-started digit-recognizer data directory
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# Load the data
def opencsv():
    """Read the Kaggle train/test CSVs and split features from labels.

    Returns:
        (trainData, trainLabel, preData): pixel matrix and label vector for
        the training set, and the full feature matrix of the test set.
    """
    print('Load Data...')
    # Read both files with pandas
    train_frame = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
    test_frame = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
    train_values = train_frame.values
    trainData = train_values[:, 1:]    # every column after the first: pixels
    trainLabel = train_values[:, 0]    # first column: the digit label
    preData = test_frame.values[:, :]  # test set carries no label column
    return trainData, trainLabel, preData
# Preprocessing: dimensionality reduction with PCA (principal component analysis)
def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
    """Fit PCA on the training set and project train/test/prediction data.

    COMPONENT_NUM follows sklearn's n_components convention:
      n_components >= 1     -> number of components to keep
      0 < n_components < 1  -> fraction of total variance to retain (e.g. 0.99)
    Usage notes: https://www.cnblogs.com/pinard/p/6243025.html

    Returns:
        (pcaTrainData, pcaTestData, pcaPreData): the three inputs projected
        into the reduced space.
    """
    print('dimensionality reduction...')
    train_arr = np.array(x_train)
    test_arr = np.array(x_test)
    pre_arr = np.array(preData)

    pca = PCA(n_components=COMPONENT_NUM, whiten=True)
    # Fit on the training data only, then apply the same projection everywhere
    pca.fit(train_arr)
    reduced_train = pca.transform(train_arr)
    reduced_test = pca.transform(test_arr)
    reduced_pre = pca.transform(pre_arr)

    # Report per-component variance, variance ratios and the component count
    print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n', pca.n_components_)
    print(sum(pca.explained_variance_ratio_))
    return reduced_train, reduced_test, reduced_pre
# Model training
def trainModel(trainData, trainLabel):
    """Fit an RBF-kernel SVM (C=4) on the given data and return the classifier."""
    print('Train SVM...')
    classifier = SVC(C=4, kernel='rbf')
    classifier.fit(trainData, trainLabel)  # train the SVM
    return classifier
# Write the prediction results
def saveResult(result, csvName):
    """Write predictions to a Kaggle-format CSV with ImageId/Label columns.

    Args:
        result: iterable of predicted labels (anything int() accepts).
        csvName: path of the output CSV file.
    """
    # newline='' is required by the csv module: without it Windows output
    # gets an extra blank line after every row (the original code leaked this)
    with open(csvName, 'w', newline='') as myFile:
        myWriter = csv.writer(myFile)
        myWriter.writerow(["ImageId", "Label"])
        # Kaggle's ImageId is 1-based
        for index, r in enumerate(result, start=1):
            myWriter.writerow([index, int(r)])
    print('Saved successfully...')  # prediction results saved
# Analyse the data: eigen-spectrum of the covariance matrix, to see how much
# information each feature direction carries (guides feature extraction)
def analyse_data(dataMat):
    """Print the variance contribution of the top principal components.

    For each of the (up to) 100 largest eigenvalues of the covariance matrix
    of dataMat, prints the component rank, its share of the total variance,
    and the cumulative share. Returns None; all output goes to stdout.
    """
    meanVals = np.mean(dataMat, axis=0)  # per-column mean
    meanRemoved = dataMat - meanVals     # center every feature column
    # rowvar=0: columns are variables; np.cov divides by n-1, the unbiased
    # covariance estimate (divisor n would be the biased one)
    covMat = np.cov(meanRemoved, rowvar=0)
    # eigh instead of eig + np.mat: the covariance matrix is symmetric, so
    # eigh is the appropriate solver, guarantees real eigenvalues (eig could
    # return a complex dtype and crash float() below), and avoids the
    # deprecated np.matrix class the original used
    eigvals, eigVects = np.linalg.eigh(covMat)
    eigValInd = np.argsort(eigvals)      # indices sorted by ascending eigenvalue
    topNfeat = 100                       # number of leading components to report
    # walk the sorted index array backwards to get the topNfeat largest
    # eigenvalues in descending order
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    # total variance across all components
    cov_all_score = float(sum(eigvals))
    sum_cov_score = 0
    for i in range(0, len(eigValInd)):
        line_cov_score = float(eigvals[eigValInd[i]])
        sum_cov_score += line_cov_score
        # On MNIST more than 20% of the eigenvalues are ~0: those features are
        # copies (linear combinations) of others and add no information. The
        # first ~15 values dominate (order 1e5) and importance drops quickly
        # after that. Tiny negative values are numerical round-off and should
        # be read as 0.
        print('主成分:%s, 方差占比:%s%%, 累积方差占比:%s%%' % (format(i+1, '2.0f'), format(line_cov_score/cov_all_score*100, '4.2f'), format(sum_cov_score/cov_all_score*100, '4.1f')))
# Search for the component count that yields the highest accuracy
def getOptimalAccuracy(trainData, trainLabel, preData):
    """Grid-search the PCA component count (30..44) for the best SVM accuracy.

    Holds out 10% of the training data, trains one SVM per component count,
    keeps the classifier with the lowest hold-out error, prints a
    classification report for it, and returns (best classifier, PCA-reduced
    prediction data matching that classifier).
    """
    # Data analysis suggests roughly 100 informative features
    # analyse_data(trainData)
    x_train, x_test, y_train, y_test = train_test_split(trainData, trainLabel, test_size=0.1)
    lineLen, featureLen = np.shape(x_test)  # hold-out size and feature count
    # print(lineLen, type(lineLen), featureLen, type(featureLen))

    minErr = 1            # lowest error rate seen so far
    minSumErr = 0         # mis-classification count at the best setting
    optimalNum = 1        # component count at the best setting
    optimalLabel = []
    optimalSVMClf = None
    pcaPreDataResult = None

    for num_components in range(30, 45, 1):
        # Reduce, train and evaluate with this many components
        pcaTrainData, pcaTestData, pcaPreData = dRCsv(x_train, x_test, preData, num_components)
        svmClf = trainModel(pcaTrainData, y_train)
        predicted = svmClf.predict(pcaTestData)
        ones_col = np.mat(np.ones((lineLen, 1)))
        mismatches = ones_col[predicted != y_test].sum()
        err_rate = mismatches / lineLen
        print('i=%s' % num_components, lineLen, mismatches, err_rate)
        if err_rate <= minErr:
            # New best: remember the classifier and its reduced inputs
            minErr = err_rate
            minSumErr = mismatches
            optimalNum = num_components
            optimalSVMClf = svmClf
            optimalLabel = predicted
            pcaPreDataResult = pcaPreData
            print("i=%s >>>>> \t" % num_components, lineLen, int(minSumErr), 1 - minErr)

    # Show precision / recall / f1-score / support per class.
    # precision: accuracy per class; recall: coverage per class;
    # f1-score: combined score; support: number of samples compared.
    # Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
    # target_names follows the label classes present in y
    # target_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    target_names = [str(label) for label in list(set(y_test))]
    print(target_names)
    print(classification_report(y_test, optimalLabel, target_names=target_names))
    print("特征数量= %s, 存在最优解:>>> \t" % optimalNum, lineLen, int(minSumErr), 1 - minErr)
    return optimalSVMClf, pcaPreDataResult
# Persist a model
def storeModel(model, filename):
    """Serialize *model* to *filename* with pickle."""
    import pickle
    with open(filename, 'wb') as out_file:
        pickle.dump(model, out_file)
# Load a persisted model
def getModel(filename):
    """Load and return a pickled object from *filename*.

    NOTE(security): pickle.load executes arbitrary code during
    deserialization — only load files this program wrote itself.
    """
    import pickle
    # 'with' guarantees the handle is closed; the original opened the file
    # and never closed it (resource leak)
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def trainDRSVM():
    """End-to-end training: load data, fit the best SVM, persist artifacts."""
    t_begin = time.time()
    # Load the raw data
    trainData, trainLabel, preData = opencsv()
    # Model training (with PCA dimensionality reduction); keeps the best classifier
    best_clf, reduced_pre = getOptimalAccuracy(trainData, trainLabel, preData)
    # Persist the classifier and the PCA-reduced prediction data
    storeModel(best_clf, os.path.join(data_dir, 'output/Result_sklearn_SVM.model'))
    storeModel(reduced_pre, os.path.join(data_dir, 'output/Result_sklearn_SVM.pcaPreData'))
    print("finish!")
    t_end = time.time()
    print('TrainModel store time used:%f s' % (t_end - t_begin))
def preDRSVM():
    """Load the persisted SVM and reduced data, predict, and write the CSV."""
    t_begin = time.time()
    # Load the trained model and the PCA-reduced prediction data
    best_clf = getModel(os.path.join(data_dir, 'output/Result_sklearn_SVM.model'))
    reduced_pre = getModel(os.path.join(data_dir, 'output/Result_sklearn_SVM.pcaPreData'))
    # Predict labels for the test set
    testLabel = best_clf.predict(reduced_pre)
    # print("testLabel = %f" % testscore)
    # Write the submission file
    saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_SVM.csv'))
    print("finish!")
    t_end = time.time()
    print('PreModel load time used:%f s' % (t_end - t_begin))
# Data visualization
def dataVisulization(data, labels):
    """Project *data* to 2-D with PCA and scatter-plot it, colored by label."""
    pca = PCA(n_components=2, whiten=True)  # reduce to 2 dimensions with PCA
    pca.fit(data)
    projected = pca.transform(data)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # One scatter call per class so each digit gets its own color
    for digit in set(labels):
        mask = labels == digit
        plt.scatter(projected[mask, 0], projected[mask, 1])
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    plt.title('MNIST visualization')
    plt.show()
if __name__ == '__main__':
    # Load the data and show a 2-D PCA visualization of the training set
    trainData, trainLabel, preData = opencsv()
    dataVisulization(trainData, trainLabel)
    # Train and store the model
    #trainDRSVM()
    # Analyse the data
    #analyse_data(trainData)
    # Load the prediction data set and run prediction
    #preDRSVM()