diff --git a/competitions/getting-started/house-price/README.md b/competitions/getting-started/house-price/README.md index e186153..c38e8e3 100644 --- a/competitions/getting-started/house-price/README.md +++ b/competitions/getting-started/house-price/README.md @@ -1929,6 +1929,10 @@ mode_br.fit(x_train, y_train) y_test = np.expm1(mode_br.predict(x_test)) ``` +## 四 建立模型 + +> 模型融合 voting + ```python # 模型融合 class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin): @@ -1989,15 +1993,3 @@ result['SalePrice'] = ensemble # index=False 是用来除去行编号 result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False) ``` - - Id SalePrice - 0 1461 110469.586157 - 1 1462 148368.953437 - 2 1463 172697.673678 - 3 1464 189844.587562 - 4 1465 207009.716532 - 5 1466 188820.407208 - 6 1467 163107.556014 - 7 1468 180732.346459 - 8 1469 194841.804925 - 9 1470 110570.281362 diff --git a/src/python/getting-started/digit-recognizer/knn-python3.6.py b/src/python/getting-started/digit-recognizer/knn-python3.6.py index 3973a9e..b9a2261 100644 --- a/src/python/getting-started/digit-recognizer/knn-python3.6.py +++ b/src/python/getting-started/digit-recognizer/knn-python3.6.py @@ -14,9 +14,9 @@ import numpy as np import pandas as pd from sklearn.decomposition import PCA from sklearn.neighbors import KNeighborsClassifier -import sys -data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' +data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/' + # 加载数据 def opencsv(): @@ -31,18 +31,15 @@ def opencsv(): def saveResult(result, csvName): - with open(csvName, 'w', newline='') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题) + with open(csvName, 'w') as myFile: # 创建记录输出结果的文件(w 和 wb 使用的时候有问题) # python3里面对 str和bytes类型做了严格的区分,不像python2里面某些函数里可以混用。所以用python3来写wirterow时,打开文件不要用wb模式,只需要使用w模式,然后带上newline='' - myWriter = csv.writer(myFile) # 对文件执行写入 - myWriter.writerow(["ImageId", "Label"]) # 设置表格的列名 + myWriter = csv.writer(myFile) + myWriter.writerow(["ImageId", "Label"]) index = 0 - for i in result: - tmp = [] - index = index + 1 - tmp.append(index) - # tmp.append(i) - tmp.append(int(i)) # 测试集的标签值 - myWriter.writerow(tmp) + for r in result: + index += 1 + myWriter.writerow([index, int(r)]) + print('Saved successfully...') # 保存预测结果 def knnClassify(trainData, trainLabel): @@ -95,7 +92,7 @@ def dRecognition_knn(): # 结果预测 testLabel = knnClf.predict(testData) - + # 结果的输出 saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv')) print("finish!") diff --git a/src/python/getting-started/digit-recognizer/svm-python3.6.py b/src/python/getting-started/digit-recognizer/svm-python3.6.py index 2b22b56..7da3a92 100644 --- a/src/python/getting-started/digit-recognizer/svm-python3.6.py +++ b/src/python/getting-started/digit-recognizer/svm-python3.6.py @@ -6,7 +6,6 @@ Created on 2017-10-26 Update on 2017-10-26 Author: 片刻 Github: https://github.com/apachecn/kaggle -PCA主成成分分析 ''' import os.path @@ -21,7 +20,8 @@ from sklearn.metrics import classification_report from sklearn.model_selection import train_test_split # 数据路径 -data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/' +data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/' + # 加载数据 def opencsv(): @@ -61,7 +61,6 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM): return pcaTrainData, pcaTestData, pcaPreData - # 训练模型 def trainModel(trainData, trainLabel): print('Train SVM...') @@ -85,20 +84,20 @@ def saveResult(result, csvName): # 分析数据,看数据是否满足要求(通过这些来检测数据的相关性,考虑在分类的时候提取出重要的特征) def analyse_data(dataMat): meanVals = np.mean(dataMat, axis=0) # np.mean 求出每列的平均值meanVals - meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值 + meanRemoved = dataMat-meanVals # 每一列特征值减去该列的特征值均值 # 计算协方差矩阵,除数n-1是为了得到协方差的 无偏估计 # cov(X,0) = cov(X) 除数是n-1(n为样本个数) # cov(X,1) 除数是n - covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值, + covMat = np.cov(meanRemoved, rowvar=0) # cov 计算协方差的值, # np.mat 是用来生成一个矩阵的 # 保存特征值(eigvals)和对应的特征向量(eigVects) - eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中 - eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值 + eigvals, eigVects = np.linalg.eig(np.mat(covMat)) # linalg.eig 计算的值是矩阵的特征值,保存在对应的矩阵中 + eigValInd = np.argsort(eigvals) # argsort 对特征值进行排序,返回的是数值从小到大的索引值 - topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数 + topNfeat = 100 # 需要保留的特征维度,即要压缩成的维度数 # 从排序后的矩阵最后一个开始自下而上选取最大的N个特征值,返回其对应的索引 - eigValInd = eigValInd[:-(topNfeat+1):-1] + eigValInd = eigValInd[:-(topNfeat+1):-1] # 计算特征值的总和 cov_all_score = float(sum(eigvals)) @@ -184,6 +183,7 @@ def getModel(filename): fr = open(filename, 'rb') return pickle.load(fr) + def trainDRSVM(): startTime = time.time() @@ -215,6 +215,7 @@ def preDRSVM(): stopTime = time.time() print('PreModel load time used:%f s' % (stopTime - startTime)) + # 数据可视化 def dataVisulization(data, labels): pca = PCA(n_components=2, whiten=True) # 使用PCA方法降到2维 @@ -230,6 +231,7 @@ def dataVisulization(data, labels): plt.title('MNIST visualization') plt.show() + if __name__ == '__main__': trainData, trainLabel, preData = opencsv() dataVisulization(trainData, trainLabel)