Merge pull request #229 from jiangzhonglian/master

更新 数字识别  knn 写入方式
This commit is contained in:
片刻 2018-05-24 12:30:51 +08:00 committed by GitHub
commit 79e112411e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 34 deletions

View File

@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train)
y_test = np.expm1(mode_br.predict(x_test)) y_test = np.expm1(mode_br.predict(x_test))
``` ```
## 四 建立模型
> 模型融合 voting
```python ```python
# 模型融合 # 模型融合
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin): class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble
# index=False 是用来除去行编号 # index=False 是用来除去行编号
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False) result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
``` ```
Id SalePrice
0 1461 110469.586157
1 1462 148368.953437
2 1463 172697.673678
3 1464 189844.587562
4 1465 207009.716532
5 1466 188820.407208
6 1467 163107.556014
7 1468 180732.346459
8 1469 194841.804925
9 1470 110570.281362

View File

@ -14,9 +14,9 @@ import numpy as np
import pandas as pd import pandas as pd
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
import sys
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据 # 加载数据
def opencsv(): def opencsv():
@ -31,18 +31,15 @@ def opencsv():
def saveResult(result, csvName): def saveResult(result, csvName):
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件w 和 wb 使用的时候有问题) with open(csvName, 'w') as myFile: # 创建记录输出结果的文件w 和 wb 使用的时候有问题)
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline='' # python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
myWriter = csv.writer(myFile) # 对文件执行写入 myWriter = csv.writer(myFile)
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名 myWriter.writerow(["ImageId", "Label"])
index = 0 index = 0
for i in result: for r in result:
tmp = [] index += 1
index = index + 1 myWriter.writerow([index, int(r)])
tmp.append(index) print('Saved successfully...') # 保存预测结果
# tmp.append(i)
tmp.append(int(i)) # 测试集的标签值
myWriter.writerow(tmp)
def knnClassify(trainData, trainLabel): def knnClassify(trainData, trainLabel):
@ -95,7 +92,7 @@ def dRecognition_knn():
# 结果预测 # 结果预测
testLabel = knnClf.predict(testData) testLabel = knnClf.predict(testData)
# 结果的输出 # 结果的输出
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv')) saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
print("finish!") print("finish!")

View File

@ -6,7 +6,6 @@ Created on 2017-10-26
Update on 2017-10-26 Update on 2017-10-26
Author: 片刻 Author: 片刻
Github: https://github.com/apachecn/kaggle Github: https://github.com/apachecn/kaggle
PCA主成分分析
''' '''
import os.path import os.path
@ -21,7 +20,8 @@ from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
# 数据路径 # 数据路径
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据 # 加载数据
def opencsv(): def opencsv():
@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
return pcaTrainData, pcaTestData, pcaPreData return pcaTrainData, pcaTestData, pcaPreData
# 训练模型 # 训练模型
def trainModel(trainData, trainLabel): def trainModel(trainData, trainLabel):
print('Train SVM...') print('Train SVM...')
@ -85,20 +84,20 @@ def saveResult(result, csvName):
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征) # 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
def analyse_data(dataMat): def analyse_data(dataMat):
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值 meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
# 计算协方差矩阵除数n-1是为了得到协方差的 无偏估计 # 计算协方差矩阵除数n-1是为了得到协方差的 无偏估计
# cov(X,0) = cov(X) 除数是n-1(n为样本个数) # cov(X,0) = cov(X) 除数是n-1(n为样本个数)
# cov(X,1) 除数是n # cov(X,1) 除数是n
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值, covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
# np.mat 是用来生成一个矩阵的 # np.mat 是用来生成一个矩阵的
# 保存特征值(eigvals)和对应的特征向量(eigVects) # 保存特征值(eigvals)和对应的特征向量(eigVects)
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中 eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值 eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值
topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数 topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
# 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值返回其对应的索引 # 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值返回其对应的索引
eigValInd = eigValInd[:-(topNfeat+1):-1] eigValInd = eigValInd[:-(topNfeat+1):-1]
# 计算特征值的总和 # 计算特征值的总和
cov_all_score = float(sum(eigvals)) cov_all_score = float(sum(eigvals))
@ -184,6 +183,7 @@ def getModel(filename):
fr = open(filename, 'rb') fr = open(filename, 'rb')
return pickle.load(fr) return pickle.load(fr)
def trainDRSVM(): def trainDRSVM():
startTime = time.time() startTime = time.time()
@ -215,6 +215,7 @@ def preDRSVM():
stopTime = time.time() stopTime = time.time()
print('PreModel load time used:%f s' % (stopTime - startTime)) print('PreModel load time used:%f s' % (stopTime - startTime))
# 数据可视化 # 数据可视化
def dataVisulization(data, labels): def dataVisulization(data, labels):
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维 pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
@ -230,6 +231,7 @@ def dataVisulization(data, labels):
plt.title('MNIST visualization') plt.title('MNIST visualization')
plt.show() plt.show()
if __name__ == '__main__': if __name__ == '__main__':
trainData, trainLabel, preData = opencsv() trainData, trainLabel, preData = opencsv()
dataVisulization(trainData, trainLabel) dataVisulization(trainData, trainLabel)