commit
dd20cbde57
|
@ -1,46 +0,0 @@
|
|||
#!/usr/bin/python
|
||||
# coding: utf-8
|
||||
'''
|
||||
Created on 2018-05-16
|
||||
Update on 2018-05-16
|
||||
Author: ccyf00
|
||||
Github: https://github.com/ccyf00/kaggle-1
|
||||
'''
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
# Load the Kaggle digit-recognizer data.
train = pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
test = pd.read_csv('datasets/getting-started/digit-recognizer/input/test.csv')

Y_train = train["label"]
X_train = train.drop(['label'], axis=1)
del train  # free the raw frame; only the split pieces are needed below

# Reduce the raw pixel features to 45 principal components.
pca = PCA(n_components=45)
X_train_transformed = pca.fit_transform(X_train)
X_test_transformed = pca.transform(test)

# Hold out 10% of the training data to score each (components, k) pair.
X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(
    X_train_transformed, Y_train, test_size=0.1, random_state=13)

# Grid-search over the number of PCA components kept and KNN neighbours.
components = [10, 15, 20, 25, 30, 35, 40, 45]
neighbors = [2, 3, 4, 5, 6, 7]
# scores is indexed directly as scores[component][n], so it is sized by the
# largest value of each grid axis (most entries stay zero by design).
scores = np.zeros((components[-1] + 1, neighbors[-1] + 1))
for component in components:
    for n in neighbors:
        knn = KNeighborsClassifier(n_neighbors=n)
        knn.fit(X_train_pca[:, :component], Y_train_pca)
        score = knn.score(X_test_pca[:, :component], Y_test_pca)
        scores[component][n] = score
        print('Components=', component, 'neighbors = ', n, 'Score = ', score)

# Retrain with 35 components and k=5 (presumably the best grid result —
# confirm against the printed scores) and predict the test set.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca[:, :35], Y_train_pca)
predictLabel = knn.predict(X_test_transformed[:, :35])

Submission = pd.DataFrame({"ImageId": range(1, predictLabel.shape[0] + 1),
                           "Label": predictLabel})
# BUG FIX: directory name was misspelled 'ouput', which makes to_csv fail
# unless a directory with that typo exists.
Submission.to_csv("datasets/getting-started/digit-recognizer/output/KnnMnistSubmission.csv", index=False)
# Score: 0.97385
|
|
@ -2,53 +2,79 @@
|
|||
# coding: utf-8
|
||||
'''
|
||||
Created on 2017-10-26
|
||||
Update on 2017-10-26
|
||||
Author: 片刻
|
||||
Update on 2018-05-16
|
||||
Author: 片刻/ccyf00
|
||||
Github: https://github.com/apachecn/kaggle
|
||||
'''
|
||||
|
||||
import os.path
|
||||
import csv
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from numpy import shape, ravel
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
|
||||
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
|
||||
|
||||
|
||||
# 加载数据
|
||||
# Load the data
def opencsv():
    """Read the digit-recognizer train/test CSVs from ``data_dir``.

    Returns:
        tuple: ``(train_data, train_label, test_data)`` where
        ``train_data`` holds every pixel column of the training rows,
        ``train_label`` is the first (label) column, and ``test_data``
        holds every column of the unlabeled test rows.
    """
    # Diff overlay duplicated these reads (old hard-coded paths plus the
    # data_dir version); keep only the data_dir-based reads once each.
    data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    data1 = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    train_data = data.values[0:, 1:]   # all training rows, pixel columns only
    train_label = data.values[0:, 0]   # first column is the label
    test_data = data1.values[0:, 0:]   # all test rows, all columns
    return train_data, train_label, test_data
|
||||
|
||||
|
||||
def saveResult(result, csvName):
    """Write predicted labels to ``csvName`` as a Kaggle submission CSV.

    The header row is ``ImageId,Label``; each data row is the 1-based row
    index followed by the label coerced to ``int``.

    Args:
        result: iterable of predicted labels (each coercible to int).
        csvName: destination CSV path.
    """
    # Python 3 requires text mode plus newline='' for the csv module
    # ('wb' mode breaks because csv writes str, not bytes).
    with open(csvName, 'w', newline='') as myFile:
        myWriter = csv.writer(myFile)
        myWriter.writerow(["ImageId", "Label"])  # header / column names
        # enumerate replaces the hand-rolled `index = index + 1` counter.
        for index, label in enumerate(result, start=1):
            myWriter.writerow([index, int(label)])
|
||||
|
||||
|
||||
def knnClassify(trainData, trainLabel):
    """Fit a K-nearest-neighbours classifier on the training data.

    Args:
        trainData: 2-D feature array (samples x features).
        trainLabel: label array; flattened with ``np.ravel`` before fitting.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    # Diff overlay duplicated the construct/fit pair (old bare `ravel` vs
    # new `np.ravel`); keep the np.ravel version once.
    knnClf = KNeighborsClassifier()  # default: k = 5; pass n_neighbors=... to change
    knnClf.fit(trainData, np.ravel(trainLabel))  # ravel returns a contiguous flattened array
    return knnClf
|
||||
|
||||
|
||||
# Data preprocessing: dimensionality reduction with PCA
def dRPCA(x_train, x_test, COMPONENT_NUM):
    """Project train/test features onto ``COMPONENT_NUM`` principal components.

    The PCA model is fitted on the training data only, then applied to both
    sets, so the test set never influences the learned components.

    Usage notes (https://www.cnblogs.com/pinard/p/6243025.html):
        n_components >= 1     -> exact number of components to keep
        0 < n_components < 1  -> fraction of total variance to retain

    Returns:
        tuple: ``(pcaTrainData, pcaTestData)`` — the reduced arrays.
    """
    print('dimensionality reduction...')
    train_arr = np.array(x_train)
    test_arr = np.array(x_test)

    reducer = PCA(n_components=COMPONENT_NUM, whiten=True)
    reducer.fit(train_arr)                       # learn components from train only
    reduced_train = reducer.transform(train_arr)  # project training data
    reduced_test = reducer.transform(test_arr)    # project test data

    # Report per-component variance, variance ratios, and component count.
    print(reducer.explained_variance_, '\n', reducer.explained_variance_ratio_, '\n',
          reducer.n_components_)
    print(sum(reducer.explained_variance_ratio_))
    return reduced_train, reduced_test
|
||||
|
||||
|
||||
def dRecognition_knn():
|
||||
start_time = time.time()
|
||||
|
||||
|
@ -61,6 +87,9 @@ def dRecognition_knn():
|
|||
stop_time_l = time.time()
|
||||
print('load data time used:%f' % (stop_time_l - start_time))
|
||||
|
||||
# 降维处理
|
||||
trainData, testData = dRPCA(trainData, testData, 35)
|
||||
|
||||
# 模型训练
|
||||
knnClf = knnClassify(trainData, trainLabel)
|
||||
|
||||
|
@ -68,10 +97,7 @@ def dRecognition_knn():
|
|||
testLabel = knnClf.predict(testData)
|
||||
|
||||
# 结果的输出
|
||||
saveResult(
|
||||
testLabel,
|
||||
'datasets/getting-started/digit-recognizer/output/Result_sklearn_knn.csv'
|
||||
)
|
||||
saveResult(testLabel, os.path.join(data_dir, 'Result_sklearn_knn.csv'))
|
||||
print("finish!")
|
||||
stop_time_r = time.time()
|
||||
print('classify time used:%f' % (stop_time_r - start_time))
|
||||
|
|
|
@ -79,11 +79,11 @@ def saveResult(result, csvName):
|
|||
|
||||
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
|
||||
def analyse_data(dataMat):
|
||||
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
||||
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
||||
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
|
||||
#计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
||||
#cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
||||
#cov(X,1) 除数是n
|
||||
# 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
||||
# cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
||||
# cov(X,1) 除数是n
|
||||
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
|
||||
# np.mat 是用来生成一个矩阵的
|
||||
# 保存特征值(eigvals)和对应的特征向量(eigVects)
|
||||
|
|
Loading…
Reference in New Issue