Merge pull request #195 from jiangzhonglian/master

Add PCA to optimize KNN running time
片刻 2018-05-16 13:47:40 +08:00 committed by GitHub
commit dd20cbde57
3 changed files with 49 additions and 69 deletions
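The rationale for the change: KNN's prediction cost grows with the number of features, so projecting the 784 raw pixel columns down to a few dozen PCA components makes the distance computations much cheaper at a small cost in accuracy. A minimal sketch of that effect on synthetic data (the shapes, the 35-component choice, and the timing comparison are illustrative assumptions, not measurements from this repo):

# Illustrative timing sketch only; random data stands in for the MNIST CSVs.
import time
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
X_train, y_train = rng.rand(10000, 784), rng.randint(0, 10, 10000)
X_test = rng.rand(2000, 784)

for use_pca in (False, True):
    if use_pca:
        pca = PCA(n_components=35, whiten=True).fit(X_train)
        Xtr, Xte = pca.transform(X_train), pca.transform(X_test)
    else:
        Xtr, Xte = X_train, X_test
    knn = KNeighborsClassifier(n_neighbors=5).fit(Xtr, y_train)
    t0 = time.time()
    knn.predict(Xte)  # distance computations scale with the feature count
    print('PCA' if use_pca else 'raw', 'predict time: %.2fs' % (time.time() - t0))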


@@ -1,46 +0,0 @@
#!/usr/bin/python
# coding: utf-8
'''
Created on 2018-05-16
Update on 2018-05-16
Author: ccyf00
Github: https://github.com/ccyf00/kaggle-1
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
train=pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
test=pd.read_csv('datasets/getting-started/digit-recognizer/input/test.csv')
Y_train=train["label"]
X_train=train.drop(['label'],axis=1)
del train
pca = PCA(n_components=45)
X_train_transformed=pca.fit_transform(X_train)
X_test_transformed=pca.transform(test)
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(
    X_train_transformed, Y_train, test_size=0.1, random_state=13)
components = [10,15,20,25,30,35,40,45]
neighbors = [2,3,4,5,6,7]
scores = np.zeros((components[len(components)-1]+1,neighbors[len(neighbors)-1]+1))
for component in components:
    for n in neighbors:
        knn = KNeighborsClassifier(n_neighbors=n)
        knn.fit(X_train_pca[:, :component], Y_train_pca)
        score = knn.score(X_test_pca[:, :component], Y_test_pca)
        scores[component][n] = score
        print('Components=', component, 'neighbors = ', n, 'Score = ', score)
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca[:,:35],Y_train_pca)
predictLabel=knn.predict(X_test_transformed[:,:35])
Submission = pd.DataFrame({"ImageId": range(1, predictLabel.shape[0] + 1),
                           "Label": predictLabel})
Submission.to_csv("datasets/getting-started/digit-recognizer/ouput/KnnMnistSubmission.csv", index=False)
# Score: 0.97385
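For reference, the manual double loop in the removed script (components × neighbors) could also be expressed with sklearn's Pipeline and GridSearchCV; the following is a sketch under that assumption, with illustrative parameter grids rather than the exact values used above:

# Sketch of the same PCA + KNN search using Pipeline/GridSearchCV (illustrative grids).
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

train = pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
X, y = train.drop(['label'], axis=1), train['label']

pipe = Pipeline([('pca', PCA()), ('knn', KNeighborsClassifier())])
search = GridSearchCV(pipe,
                      param_grid={'pca__n_components': [25, 35, 45],
                                  'knn__n_neighbors': [3, 5, 7]},
                      cv=3, n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)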


@@ -2,53 +2,79 @@
# coding: utf-8
'''
Created on 2017-10-26
Update on 2017-10-26
Author: 片刻
Update on 2018-05-16
Author: 片刻/ccyf00
Github: https://github.com/apachecn/kaggle
'''
import os.path
import csv
import time
import numpy as np
import pandas as pd
from numpy import shape, ravel
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# Load the data
def opencsv():
    # Read the CSVs with pandas
    data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    data1 = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    train_data = data.values[0:, 1:]  # all of the training data, [rows, columns]
    train_label = data.values[0:, 0]  # the first column holds the labels
    test_data = data1.values[0:, 0:]  # all of the test data
    return train_data, train_label, test_data
def saveResult(result, csvName):
    # Create the file that records the output ('w' vs 'wb' matters here).
    # Python 3 strictly separates str and bytes, unlike Python 2 where some functions
    # accepted either, so when calling writerow under Python 3 the file must be opened
    # in 'w' mode (not 'wb') together with newline=''.
    with open(csvName, 'w', newline='') as myFile:
        myWriter = csv.writer(myFile)  # writer bound to the file
        myWriter.writerow(["ImageId", "Label"])  # column headers
        index = 0
        for i in result:
            tmp = []
            index = index + 1
            tmp.append(index)
            # tmp.append(i)
            tmp.append(int(i))  # predicted label for this test sample
            myWriter.writerow(tmp)
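The newline='' note above is easy to verify: csv.writer terminates each row with '\r\n', and if the file is opened without newline='' Python's newline translation turns that into '\r\r\n' on Windows, which shows up as a blank line between records. A tiny self-contained illustration (the file name is hypothetical):

# Why the submission file is opened with 'w' and newline='' in Python 3.
import csv

rows = [["ImageId", "Label"], [1, 7], [2, 3]]
with open('demo_submission.csv', 'w', newline='') as f:  # exactly one '\r\n' per row
    csv.writer(f).writerows(rows)
# Opening with plain 'w' and no newline='' would, on Windows, expand the writer's
# '\r\n' into '\r\r\n', producing blank lines between records.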
def knnClassify(trainData, trainLabel):
    # default: k = 5; set it yourself with KNeighborsClassifier(n_neighbors=10)
    knnClf = KNeighborsClassifier()
    knnClf.fit(trainData, np.ravel(trainLabel))  # ravel returns a contiguous flattened array
    return knnClf
# Data preprocessing - dimensionality reduction with PCA (principal component analysis)
def dRPCA(x_train, x_test, COMPONENT_NUM):
    print('dimensionality reduction...')
    trainData = np.array(x_train)
    testData = np.array(x_test)
    '''
    Usage notes: https://www.cnblogs.com/pinard/p/6243025.html
    n_components >= 1
        n_components=NUM   keeps a fixed number of components
    0 < n_components < 1
        n_components=0.99  keeps enough components to reach this fraction of the total variance
    '''
    pca = PCA(n_components=COMPONENT_NUM, whiten=True)
    pca.fit(trainData)  # fit the model on the training data
    pcaTrainData = pca.transform(trainData)  # project the training data onto the components
    pcaTestData = pca.transform(testData)  # apply the same projection to the test data
    # PCA explained variance, explained variance ratio, and number of components
    print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n',
          pca.n_components_)
    print(sum(pca.explained_variance_ratio_))
    return pcaTrainData, pcaTestData
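The docstring's two modes for n_components can be checked directly: an integer keeps exactly that many components, while a value in (0, 1) keeps as many components as needed to reach that fraction of the total variance. A small sketch on random data (shapes are arbitrary, for illustration only):

# Integer vs. fractional n_components; random data, purely illustrative.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(500, 100)
pca_fixed = PCA(n_components=35).fit(X)    # keep exactly 35 components
pca_ratio = PCA(n_components=0.99).fit(X)  # keep enough components for 99% of the variance
print(pca_fixed.n_components_, sum(pca_fixed.explained_variance_ratio_))
print(pca_ratio.n_components_, sum(pca_ratio.explained_variance_ratio_))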
def dRecognition_knn():
    start_time = time.time()
@@ -61,6 +87,9 @@ def dRecognition_knn():
    stop_time_l = time.time()
    print('load data time used:%f' % (stop_time_l - start_time))
    # Dimensionality reduction
    trainData, testData = dRPCA(trainData, testData, 35)
    # Model training
    knnClf = knnClassify(trainData, trainLabel)
@@ -68,10 +97,7 @@ def dRecognition_knn():
    testLabel = knnClf.predict(testData)
    # Write out the results
    saveResult(testLabel, os.path.join(data_dir, 'Result_sklearn_knn.csv'))
    print("finish!")
    stop_time_r = time.time()
    print('classify time used:%f' % (stop_time_r - start_time))


@@ -79,11 +79,11 @@ def saveResult(result, csvName):
# Analyze the data to check whether it meets the requirements (inspect feature
# correlations and decide which important features to extract for classification)
def analyse_data(dataMat):
    meanVals = np.mean(dataMat, axis=0)  # np.mean computes the mean of each column
    meanRemoved = dataMat - meanVals  # subtract each column's mean from that column
    # Compute the covariance matrix; dividing by n-1 gives an unbiased estimate of the covariance
    # cov(X, 0) = cov(X): the divisor is n-1 (n is the number of samples)
    # cov(X, 1): the divisor is n
    covMat = np.cov(meanRemoved, rowvar=0)  # np.cov computes the covariance matrix
    # np.mat creates a matrix
    # Store the eigenvalues (eigvals) and the corresponding eigenvectors (eigVects)
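The hunk is truncated here, but the comments describe the usual continuation: take the eigendecomposition of the covariance matrix and report how much variance each principal component explains. A hedged sketch of that remainder (an assumption about the rest of analyse_data, not the repo's exact code):

# Hypothetical continuation of the eigen-analysis described in the comments above.
import numpy as np

def analyse_data_sketch(dataMat, topNfeat=20):
    meanRemoved = dataMat - np.mean(dataMat, axis=0)   # center each column
    covMat = np.cov(meanRemoved, rowvar=0)             # unbiased covariance matrix
    eigvals, eigVects = np.linalg.eig(np.mat(covMat))  # eigenvalues and eigenvectors
    eigvals = np.real(eigvals)                         # covariance is symmetric; drop zero imaginary parts
    order = np.argsort(-eigvals)[:topNfeat]            # largest eigenvalues first
    total = eigvals.sum()
    for i, ind in enumerate(order):
        # share of the total variance captured by the i-th principal component
        print('component %d: %.2f%% of variance' % (i + 1, 100.0 * eigvals[ind] / total))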