Merge pull request #230 from apachecn/dev

定期合并 - Dev
This commit is contained in:
片刻 2018-05-24 12:33:38 +08:00 committed by GitHub
commit f108a703a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 34 deletions

View File

@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train)
y_test = np.expm1(mode_br.predict(x_test))
```
## 四 建立模型
> 模型融合 voting
```python
# 模型融合
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble
# index=False 是用来除去行编号
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
```
Id SalePrice
0 1461 110469.586157
1 1462 148368.953437
2 1463 172697.673678
3 1464 189844.587562
4 1465 207009.716532
5 1466 188820.407208
6 1467 163107.556014
7 1468 180732.346459
8 1469 194841.804925
9 1470 110570.281362

View File

@ -14,9 +14,9 @@ import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import sys
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据
def opencsv():
@ -31,18 +31,15 @@ def opencsv():
def saveResult(result, csvName):
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件w 和 wb 使用的时候有问题)
with open(csvName, 'w') as myFile: # 创建记录输出结果的文件w 和 wb 使用的时候有问题)
# python3 里面对 str 和 bytes 类型做了严格的区分，不像 python2 里面某些函数里可以混用。所以用 python3 来写 writerow 时，打开文件不要用 wb 模式，只需要使用 w 模式，然后带上 newline=''
myWriter = csv.writer(myFile) # 对文件执行写入
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名
myWriter = csv.writer(myFile)
myWriter.writerow(["ImageId", "Label"])
index = 0
for i in result:
tmp = []
index = index + 1
tmp.append(index)
# tmp.append(i)
tmp.append(int(i)) # 测试集的标签值
myWriter.writerow(tmp)
for r in result:
index += 1
myWriter.writerow([index, int(r)])
print('Saved successfully...') # 保存预测结果
def knnClassify(trainData, trainLabel):

View File

@ -6,7 +6,6 @@ Created on 2017-10-26
Update on 2017-10-26
Author: 片刻
Github: https://github.com/apachecn/kaggle
PCA主成分分析
'''
import os.path
@ -21,7 +20,8 @@ from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# 数据路径
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
# 加载数据
def opencsv():
@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
return pcaTrainData, pcaTestData, pcaPreData
# 训练模型
def trainModel(trainData, trainLabel):
print('Train SVM...')
@ -184,6 +183,7 @@ def getModel(filename):
fr = open(filename, 'rb')
return pickle.load(fr)
def trainDRSVM():
startTime = time.time()
@ -215,6 +215,7 @@ def preDRSVM():
stopTime = time.time()
print('PreModel load time used:%f s' % (stopTime - startTime))
# 数据可视化
def dataVisulization(data, labels):
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
@ -230,6 +231,7 @@ def dataVisulization(data, labels):
plt.title('MNIST visualization')
plt.show()
if __name__ == '__main__':
trainData, trainLabel, preData = opencsv()
dataVisulization(trainData, trainLabel)