Merge pull request #230 from apachecn/dev

定期合并 - Dev
This commit is contained in:
片刻 2018-05-24 12:33:38 +08:00 committed by GitHub
commit f108a703a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 34 deletions

View File

@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train)
y_test = np.expm1(mode_br.predict(x_test)) y_test = np.expm1(mode_br.predict(x_test))
``` ```
## 四 建立模型
> 模型融合 voting
```python ```python
# 模型融合 # 模型融合
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin): class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble
# index=False 是用来除去行编号 # index=False 是用来除去行编号
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False) result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
``` ```
Id SalePrice
0 1461 110469.586157
1 1462 148368.953437
2 1463 172697.673678
3 1464 189844.587562
4 1465 207009.716532
5 1466 188820.407208
6 1467 163107.556014
7 1468 180732.346459
8 1469 194841.804925
9 1470 110570.281362

View File

@ -14,9 +14,9 @@ import numpy as np
import pandas as pd import pandas as pd
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsClassifier
import sys
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据 # 加载数据
def opencsv(): def opencsv():
@ -31,18 +31,15 @@ def opencsv():
def saveResult(result, csvName): def saveResult(result, csvName):
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题) with open(csvName, 'w') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline='' # python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
myWriter = csv.writer(myFile) # 对文件执行写入 myWriter = csv.writer(myFile)
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名 myWriter.writerow(["ImageId", "Label"])
index = 0 index = 0
for i in result: for r in result:
tmp = [] index += 1
index = index + 1 myWriter.writerow([index, int(r)])
tmp.append(index) print('Saved successfully...') # 保存预测结果
# tmp.append(i)
tmp.append(int(i)) # 测试集的标签值
myWriter.writerow(tmp)
def knnClassify(trainData, trainLabel): def knnClassify(trainData, trainLabel):

View File

@ -6,7 +6,6 @@ Created on 2017-10-26
Update on 2017-10-26 Update on 2017-10-26
Author: 片刻 Author: 片刻
Github: https://github.com/apachecn/kaggle Github: https://github.com/apachecn/kaggle
PCA主成分分析
''' '''
import os.path import os.path
@ -21,7 +20,8 @@ from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
# 数据路径 # 数据路径
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据 # 加载数据
def opencsv(): def opencsv():
@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
return pcaTrainData, pcaTestData, pcaPreData return pcaTrainData, pcaTestData, pcaPreData
# 训练模型 # 训练模型
def trainModel(trainData, trainLabel): def trainModel(trainData, trainLabel):
print('Train SVM...') print('Train SVM...')
@ -184,6 +183,7 @@ def getModel(filename):
fr = open(filename, 'rb') fr = open(filename, 'rb')
return pickle.load(fr) return pickle.load(fr)
def trainDRSVM(): def trainDRSVM():
startTime = time.time() startTime = time.time()
@ -215,6 +215,7 @@ def preDRSVM():
stopTime = time.time() stopTime = time.time()
print('PreModel load time used:%f s' % (stopTime - startTime)) print('PreModel load time used:%f s' % (stopTime - startTime))
# 数据可视化 # 数据可视化
def dataVisulization(data, labels): def dataVisulization(data, labels):
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维 pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
@ -230,6 +231,7 @@ def dataVisulization(data, labels):
plt.title('MNIST visualization') plt.title('MNIST visualization')
plt.show() plt.show()
if __name__ == '__main__': if __name__ == '__main__':
trainData, trainLabel, preData = opencsv() trainData, trainLabel, preData = opencsv()
dataVisulization(trainData, trainLabel) dataVisulization(trainData, trainLabel)