commit
f108a703a5
|
@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train)
|
||||||
y_test = np.expm1(mode_br.predict(x_test))
|
y_test = np.expm1(mode_br.predict(x_test))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 四 建立模型
|
||||||
|
|
||||||
|
> 模型融合 voting
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# 模型融合
|
# 模型融合
|
||||||
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
|
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
|
||||||
|
@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble
|
||||||
# index=False 是用来除去行编号
|
# index=False 是用来除去行编号
|
||||||
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
|
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
|
||||||
```
|
```
|
||||||
|
|
||||||
Id SalePrice
|
|
||||||
0 1461 110469.586157
|
|
||||||
1 1462 148368.953437
|
|
||||||
2 1463 172697.673678
|
|
||||||
3 1464 189844.587562
|
|
||||||
4 1465 207009.716532
|
|
||||||
5 1466 188820.407208
|
|
||||||
6 1467 163107.556014
|
|
||||||
7 1468 180732.346459
|
|
||||||
8 1469 194841.804925
|
|
||||||
9 1470 110570.281362
|
|
||||||
|
|
|
@ -14,9 +14,9 @@ import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.decomposition import PCA
|
from sklearn.decomposition import PCA
|
||||||
from sklearn.neighbors import KNeighborsClassifier
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
import sys
|
|
||||||
|
|
||||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
|
||||||
|
|
||||||
|
|
||||||
# 加载数据
|
# 加载数据
|
||||||
def opencsv():
|
def opencsv():
|
||||||
|
@ -31,18 +31,15 @@ def opencsv():
|
||||||
|
|
||||||
|
|
||||||
def saveResult(result, csvName):
|
def saveResult(result, csvName):
|
||||||
with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
|
with open(csvName, 'w') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题)
|
||||||
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
|
# python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写writerow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline=''
|
||||||
myWriter = csv.writer(myFile) # 对文件执行写入
|
myWriter = csv.writer(myFile)
|
||||||
myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名
|
myWriter.writerow(["ImageId", "Label"])
|
||||||
index = 0
|
index = 0
|
||||||
for i in result:
|
for r in result:
|
||||||
tmp = []
|
index += 1
|
||||||
index = index + 1
|
myWriter.writerow([index, int(r)])
|
||||||
tmp.append(index)
|
print('Saved successfully...') # 保存预测结果
|
||||||
# tmp.append(i)
|
|
||||||
tmp.append(int(i)) # 测试集的标签值
|
|
||||||
myWriter.writerow(tmp)
|
|
||||||
|
|
||||||
|
|
||||||
def knnClassify(trainData, trainLabel):
|
def knnClassify(trainData, trainLabel):
|
||||||
|
@ -95,7 +92,7 @@ def dRecognition_knn():
|
||||||
|
|
||||||
# 结果预测
|
# 结果预测
|
||||||
testLabel = knnClf.predict(testData)
|
testLabel = knnClf.predict(testData)
|
||||||
|
|
||||||
# 结果的输出
|
# 结果的输出
|
||||||
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
|
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
|
||||||
print("finish!")
|
print("finish!")
|
||||||
|
|
|
@ -6,7 +6,6 @@ Created on 2017-10-26
|
||||||
Update on 2017-10-26
|
Update on 2017-10-26
|
||||||
Author: 片刻
|
Author: 片刻
|
||||||
Github: https://github.com/apachecn/kaggle
|
Github: https://github.com/apachecn/kaggle
|
||||||
PCA主成分分析
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os.path
|
import os.path
|
||||||
|
@ -21,7 +20,8 @@ from sklearn.metrics import classification_report
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
# 数据路径
|
# 数据路径
|
||||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
|
||||||
|
|
||||||
|
|
||||||
# 加载数据
|
# 加载数据
|
||||||
def opencsv():
|
def opencsv():
|
||||||
|
@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
|
||||||
return pcaTrainData, pcaTestData, pcaPreData
|
return pcaTrainData, pcaTestData, pcaPreData
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 训练模型
|
# 训练模型
|
||||||
def trainModel(trainData, trainLabel):
|
def trainModel(trainData, trainLabel):
|
||||||
print('Train SVM...')
|
print('Train SVM...')
|
||||||
|
@ -85,20 +84,20 @@ def saveResult(result, csvName):
|
||||||
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
|
# 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征)
|
||||||
def analyse_data(dataMat):
|
def analyse_data(dataMat):
|
||||||
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals
|
||||||
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
|
meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值
|
||||||
# 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
# 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计
|
||||||
# cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
# cov(X,0) = cov(X) 除数是n-1(n为样本个数)
|
||||||
# cov(X,1) 除数是n
|
# cov(X,1) 除数是n
|
||||||
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
|
covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值,
|
||||||
# np.mat 是用来生成一个矩阵的
|
# np.mat 是用来生成一个矩阵的
|
||||||
# 保存特征值(eigvals)和对应的特征向量(eigVects)
|
# 保存特征值(eigvals)和对应的特征向量(eigVects)
|
||||||
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
|
eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中
|
||||||
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值
|
eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值
|
||||||
|
|
||||||
topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
|
topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数
|
||||||
|
|
||||||
# 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引
|
# 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引
|
||||||
eigValInd = eigValInd[:-(topNfeat+1):-1]
|
eigValInd = eigValInd[:-(topNfeat+1):-1]
|
||||||
|
|
||||||
# 计算特征值的总和
|
# 计算特征值的总和
|
||||||
cov_all_score = float(sum(eigvals))
|
cov_all_score = float(sum(eigvals))
|
||||||
|
@ -184,6 +183,7 @@ def getModel(filename):
|
||||||
fr = open(filename, 'rb')
|
fr = open(filename, 'rb')
|
||||||
return pickle.load(fr)
|
return pickle.load(fr)
|
||||||
|
|
||||||
|
|
||||||
def trainDRSVM():
|
def trainDRSVM():
|
||||||
startTime = time.time()
|
startTime = time.time()
|
||||||
|
|
||||||
|
@ -215,6 +215,7 @@ def preDRSVM():
|
||||||
stopTime = time.time()
|
stopTime = time.time()
|
||||||
print('PreModel load time used:%f s' % (stopTime - startTime))
|
print('PreModel load time used:%f s' % (stopTime - startTime))
|
||||||
|
|
||||||
|
|
||||||
# 数据可视化
|
# 数据可视化
|
||||||
def dataVisulization(data, labels):
|
def dataVisulization(data, labels):
|
||||||
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
|
pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维
|
||||||
|
@ -230,6 +231,7 @@ def dataVisulization(data, labels):
|
||||||
plt.title('MNIST visualization')
|
plt.title('MNIST visualization')
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
trainData, trainLabel, preData = opencsv()
|
trainData, trainLabel, preData = opencsv()
|
||||||
dataVisulization(trainData, trainLabel)
|
dataVisulization(trainData, trainLabel)
|
||||||
|
|
Loading…
Reference in New Issue