commit
8ed040054e
@ -44,14 +44,13 @@ print ('the right rate is:',float(rightCount)/len(zeroLable))
result = clf.predict(data_pca[len(train_data):])

i = 0
fw = open("C:\\Users\\312\\Desktop\\digit-recognizer\\result.csv", 'w')
with open('C:\\Users\\312\\Desktop\\digit-recognizer\\sample_submission.csv') as pred_file:
    fw.write('{},{}\n'.format('ImageId', 'Label'))
    for line in pred_file.readlines()[1:]:
        splits = line.strip().split(',')
        fw.write('{},{}\n'.format(splits[0], result[i]))
        i += 1

with open("C:\\Users\\312\\Desktop\\digit-recognizer\\result.csv", 'w') as fw:
    with open('C:\\Users\\312\\Desktop\\digit-recognizer\\sample_submission.csv') as pred_file:
        fw.write('{},{}\n'.format('ImageId', 'Label'))
        for i, line in enumerate(pred_file.readlines()[1:]):
            splits = line.strip().split(',')
            fw.write('{},{}\n'.format(splits[0], result[i]))
```

> Result:
@ -56,14 +56,12 @@ print ('the right rate is:',float(rightCount)/len(zeroLable))
result = clf.predict(data_pca[len(train_data):])

i = 0
fw = open("C:\\Users\\312\\Desktop\\digit-recognizer\\result.csv", 'w')
with open('C:\\Users\\312\\Desktop\\digit-recognizer\\sample_submission.csv') as pred_file:
    fw.write('{},{}\n'.format('ImageId', 'Label'))
    for line in pred_file.readlines()[1:]:
        splits = line.strip().split(',')
        fw.write('{},{}\n'.format(splits[0], result[i]))
        i += 1
with open("C:\\Users\\312\\Desktop\\digit-recognizer\\result.csv", 'w') as fw:
    with open('C:\\Users\\312\\Desktop\\digit-recognizer\\sample_submission.csv') as pred_file:
        fw.write('{},{}\n'.format('ImageId', 'Label'))
        for i, line in enumerate(pred_file.readlines()[1:]):
            splits = line.strip().split(',')
            fw.write('{},{}\n'.format(splits[0], result[i]))
```

> Result:
@ -34,7 +34,7 @@
IV. Model ensembling
1. You can follow the simple model-ensembling approach from the Titanic competition, picking a suitable model by comparing the models' scores (a minimal sketch follows this list)
2. Reference material on model ensembling
2. In the house-price prediction we used model ensembling to produce the final output, and the result was very good.
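
The score-based comparison in item 1 can be as simple as cross-validating every candidate and keeping the best one. A minimal sketch (illustrative only; the candidate list, metric, and data are assumptions, not taken from this repo):

```python
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


def pick_best_model(X, y):
    # Candidate models to compare; swap in whatever the competition calls for.
    candidates = {
        'logreg': LogisticRegression(max_iter=1000),
        'rf': RandomForestClassifier(n_estimators=100, random_state=0),
        'gbdt': GradientBoostingClassifier(random_state=0),
    }
    scores = {}
    for name, model in candidates.items():
        # Mean 5-fold accuracy is the comparison score.
        scores[name] = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
        print('{}: {:.4f}'.format(name, scores[name]))
    best = max(scores, key=scores.get)
    return candidates[best].fit(X, y)
```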

V. Adjusting features and model parameters
1. Adding or modifying features can raise the model's performance ceiling.
@ -1827,7 +1827,70 @@ plt.show()


```python
# Model selection
# LASSO Regression
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
# Elastic Net Regression
ENet = make_pipeline(
    RobustScaler(), ElasticNet(
        alpha=0.0005, l1_ratio=.9, random_state=3))
# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5)
# XGBoost
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=7,
    nthread=-1)
# lightGBM
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11)
# Score these base models
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(KRR)
print(
    "Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                          score.std()))
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
```
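
The scoring above relies on an `rmsle_cv` helper defined outside this hunk. A minimal sketch of the helper this kernel family typically uses (assuming `train` holds the feature frame and `y_train` the log1p-transformed target, as elsewhere in the document; the fold count and seed are assumptions):

```python
import numpy as np
from sklearn.model_selection import KFold, cross_val_score

n_folds = 5

def rmsle_cv(model):
    # Shuffled k-fold CV; with a log-transformed target, the root of the
    # (negated) mean squared error is the RMSLE of the raw target.
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
```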
|
||||
|
||||
|
||||
```python
|
||||
|
@ -1867,10 +1930,64 @@ y_test = np.expm1(mode_br.predict(x_test))
```

```python
# Submit the results
submission_df = pd.DataFrame(data={'Id': test['Id'], 'SalePrice': y_test})
print(submission_df.head(10))
submission_df.to_csv('/Users/jiangzl/Desktop/submission_br.csv', columns=['Id', 'SalePrice'], index=False)
# Model ensembling
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models

    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self

    # Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack(
            [model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)


# Evaluate how well these four models do together
averaged_models = AveragingModels(models=(ENet, GBoost, KRR, lasso))
score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                              score.std()))

# Final training and prediction
# StackedRegressor
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

# XGBoost
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))
# lightGBM
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train, stacked_train_pred * 0.70 + xgb_train_pred * 0.15 +
            lgb_train_pred * 0.15))
# The ensemble's prediction
ensemble = stacked_pred * 0.70 + xgb_pred * 0.15 + lgb_pred * 0.15
# Save the results
result = pd.DataFrame()
result['Id'] = test_ID
result['SalePrice'] = ensemble
# index=False drops the row index
result.to_csv('/Users/liudong/Desktop/house_price/result.csv', index=False)
```
Id SalePrice
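
`stacked_averaged_models` and `rmsle` are used above but defined outside this hunk. A sketch of the usual definitions in this kernel's lineage (out-of-fold stacking with a meta-model; the class name, fold count, and example wiring are assumptions):

```python
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def rmsle(y, y_pred):
    # With a log1p-transformed target, RMSE here equals RMSLE of raw prices.
    return np.sqrt(mean_squared_error(y, y_pred))

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        # Out-of-fold predictions of the base models become the
        # training features of the meta-model.
        out_of_fold = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_idx, holdout_idx in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_idx], y[train_idx])
                out_of_fold[holdout_idx, i] = instance.predict(X[holdout_idx])
        self.meta_model_.fit(out_of_fold, y)
        return self

    def predict(self, X):
        # Average each base model's fold-instances, then let the meta-model
        # combine the averaged columns.
        meta_features = np.column_stack([
            np.column_stack([m.predict(X) for m in models]).mean(axis=1)
            for models in self.base_models_])
        return self.meta_model_.predict(meta_features)

# e.g. stacked_averaged_models = StackingAveragedModels(
#          base_models=(ENet, GBoost, KRR), meta_model=lasso)
```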
@ -89,7 +89,7 @@ imgs = [os.path.join(root, img) for img in os.listdir(root)]
if self.test or not train:
    self.transforms = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), normalize])
else:
    # If this is the test set, use a different set of transforms
    # If this is the training set, use a different set of transforms
    self.transforms = T.Compose([T.Resize(256), T.RandomResizedCrop(224), T.RandomHorizontalFlip(), T.ToTensor(), normalize])
```
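
These branches normally live in a custom `Dataset.__init__`. A self-contained sketch for context (the class name and the lack of label handling are assumptions; only the transform logic mirrors the diff):

```python
import os
from PIL import Image
import torch.utils.data as data
from torchvision import transforms as T

normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

class ImageFolderDataset(data.Dataset):
    def __init__(self, root, train=True, test=False):
        self.test = test
        self.imgs = [os.path.join(root, img) for img in os.listdir(root)]
        if self.test or not train:
            # Deterministic resize + center crop for validation/test.
            self.transforms = T.Compose(
                [T.Resize(224), T.CenterCrop(224), T.ToTensor(), normalize])
        else:
            # Random crop + horizontal flip augment the training set.
            self.transforms = T.Compose(
                [T.Resize(256), T.RandomResizedCrop(224),
                 T.RandomHorizontalFlip(), T.ToTensor(), normalize])

    def __getitem__(self, index):
        img = Image.open(self.imgs[index]).convert('RGB')
        return self.transforms(img)

    def __len__(self):
        return len(self.imgs)
```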
@ -142,7 +142,7 @@ loader_test = data.DataLoader(test_dataset, batch_size=3, shuffle=True, num_work
## IV. Building the CNN model

```python
# Call our ready-made AlexNet() model
# Call the already-written AlexNet() model
cnn = AlexNet()
# Print the model to inspect it
print(cnn)
@ -0,0 +1,86 @@
# [Kaggle](https://www.kaggle.com) getting-started guide

## [Sign up](https://www.kaggle.com/?login=true)

1. First, register an account
2. Link your GitHub account



## [Competitions](https://www.kaggle.com/competitions)

* [Select - All and Getting Started](https://www.kaggle.com/competitions?sortBy=deadline&group=all&page=1&pageSize=20&segment=gettingStarted)



* [Select - Digit Recognizer](https://www.kaggle.com/c/digit-recognizer)



* [Read the materials - Digit Recognizer](https://www.kaggle.com/c/digit-recognizer)

**Just pick a framework version and the highest-starred Kernels to edit, then follow the [**Digit Recognizer**](/competitions/getting-started/digit-recognizer) example when updating**



## Project conventions (example: DigitRecognizer)

> Documents: the file name ends with the project name plus `.md`

* Example: `competitions/getting-started/digit-recognizer.md`
* For instance, the Digit Recognizer document belongs under `competitions -> GettingStarted`, so just create `competitions/getting-started` to hold the document

> Images: the file name can be chosen freely

* Example: `static/images/competitions/getting-started/digit-recognizer/front_page.png`
* For instance, Digit Recognizer images belong under `competitions -> GettingStarted -> DigitRecognizer`, so just create `competitions/getting-started/digit-recognizer` to hold the document's images


> Code: the file name can be chosen freely, ending in `.py`

* Example: `src/python/getting-started/digit-recognizer/dr_knn_pandas.py`
* For instance, Digit Recognizer code exists only for the `competition`, so just create `getting-started/digit-recognizer` to hold the code
* Requirements (approach: fully decoupled; a skeleton sketch follows this section)
    1. Load the data
    2. Preprocess the data (optional)
    3. Train the model
    4. Evaluate the model (optional)
    5. Export the data
* Declare the Python interpreter and the encoding

```python
#!/usr/bin/python
# coding: utf-8
```

* Declare the project description

```python
'''
Created on 2017-10-26
Update on 2017-10-26
Author: 【if an individual】片刻
Team: 【if a team】装逼从不退缩(张三、李四 etc.)
Github: https://github.com/apachecn/kaggle
'''
```
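
A hedged skeleton of the fully decoupled layout the five requirements above describe (the function names, paths, and the KNN choice are illustrative placeholders, not project mandates):

```python
#!/usr/bin/python
# coding: utf-8

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier


def load_data(path):
    # 1. Load the data
    return pd.read_csv(path)


def preprocess(df):
    # 2. Preprocess the data (optional)
    return df.fillna(0)


def train_model(X, y):
    # 3. Train the model
    return KNeighborsClassifier().fit(X, y)


def evaluate_model(model, X, y):
    # 4. Evaluate the model (optional)
    print('accuracy:', model.score(X, y))


def export_data(result, path):
    # 5. Export the data
    result.to_csv(path, index=False)
```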

> Data: the file name can be chosen freely

* Input: `datasets/getting-started/digit-recognizer/input/train.csv`
* Output: `datasets/getting-started/digit-recognizer/output/Result_sklearn_knn.csv`
* For instance, Digit Recognizer data exists only for the `competition`, so just create `getting-started/digit-recognizer` to hold the data

> Submitting results

Submit the output results on the competition's page

<a href="https://www.kaggle.com/c/digit-recognizer/submit" target="_blank">
    <img src="../static/images/docs/kaggle-submit.jpg">
</a>

## The docs directory (can be ignored)

`The docs directory holds operation/readme documents curated by ApacheCN; it is unrelated to content on the kaggle site`

**More updates to follow**
@ -2,8 +2,8 @@
# coding: utf-8
'''
Created on 2018-05-14
Update on 2018-05-14
Author: 平淡的天
Update on 2018-05-19
Author: 平淡的天/wang-sw
Github: https://github.com/apachecn/kaggle
'''
import os.path
@ -11,50 +11,153 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import pandas as pd
# from sklearn.grid_search import GridSearchCV
import numpy as np
# from sklearn.model_selection import GridSearchCV
# from numpy import arange
# from lightgbm import LGBMClassifier
data_dir = \
    r'/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
import os.path
import time

train_data = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)
data.drop(['label'], axis=1, inplace=True)
label = train_data.label
# Data path
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'

pca = PCA(n_components=100, random_state=34)
data_pca = pca.fit_transform(data)
# Load the data
def opencsv():
    # Open the files with pandas
    train_data = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
    test_data = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
    data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)
    data.drop(['label'], axis=1, inplace=True)
    label = train_data.label
    return train_data, test_data, data, label

X_train, X_test, y_train, y_test = train_test_split(
    data_pca[0:len(train_data)], label, test_size=0.1, random_state=34)
# Data preprocessing: dimensionality reduction with PCA (principal component analysis)
def dRPCA(data, COMPONENT_NUM=100):
    print('dimensionality reduction...')
    data = np.array(data)
    '''
    Usage notes: https://www.cnblogs.com/pinard/p/6243025.html
    n_components >= 1
        n_components=NUM  sets the number of components to keep
    0 < n_components < 1
        n_components=0.99 sets a threshold on the total explained-variance ratio
    '''
    pca = PCA(n_components=COMPONENT_NUM, random_state=34)
    data_pca = pca.fit_transform(data)

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=20,
    min_samples_leaf=1,
    random_state=34)
# clf=LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
# param_test1 = {'n_estimators':arange(10,150,10),'max_depth':arange(1,11,1)}
# gsearch1 = GridSearchCV(estimator = clf, param_grid = param_test1, scoring='accuracy',iid=False,cv=5)
# gsearch1.fit(Xtrain,xtest)
# print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    # PCA explained variance, explained-variance ratio, and number of components
    print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n',
          pca.n_components_)
    print(sum(pca.explained_variance_ratio_))
    storeModel(data_pca, os.path.join(data_dir, 'output/Result_sklearn_rf.pcaData'))
    return data_pca
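
# Illustration (not part of the original commit): per the docstring above,
# dRPCA also accepts a fractional n_components, e.g.
#   data_pca = dRPCA(data, 0.99)
# which keeps just enough components to explain 99% of the total variance.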

clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

zeroLable = y_test - y_predict
rightCount = 0
for i in range(len(zeroLable)):
    if list(zeroLable)[i] == 0:
        rightCount += 1
print('the right rate is:', float(rightCount) / len(zeroLable))
# Train the model
def trainModel(X_train, y_train):
    print('Train RF...')
    clf = RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=34)
    clf.fit(X_train, y_train)  # train the random forest

result = clf.predict(data_pca[len(train_data):])
    # clf=LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)

n, _ = test_data.shape
with open(os.path.join(data_dir, 'output/Result_sklearn_RF.csv'), 'w') as fw:
    fw.write('{},{}\n'.format('ImageId', 'Label'))
    for i in range(1, n + 1):
        fw.write('{},{}\n'.format(i, result[i - 1]))
    # param_test1 = {'n_estimators':arange(10,150,10),'max_depth':arange(1,21,1)}
    # gsearch1 = GridSearchCV(estimator = clf, param_grid = param_test1, scoring='accuracy',iid=False,cv=5)
    # gsearch1.fit(X_train, y_train)
    # print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    # clf=gsearch1.best_estimator_

    return clf


# Compute the accuracy
def printAccuracy(y_test, y_predict):
    zeroLable = y_test - y_predict
    rightCount = 0
    for i in range(len(zeroLable)):
        if list(zeroLable)[i] == 0:
            rightCount += 1
    print('the right rate is:', float(rightCount) / len(zeroLable))

# Save the model
def storeModel(model, filename):
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(model, fw)

# Load the model
def getModel(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)

# Write out the prediction results
def saveResult(result, csvName):
    n = len(result)
    print('the size of test set is {}'.format(n))
    with open(csvName, 'w') as fw:
        fw.write('{},{}\n'.format('ImageId', 'Label'))
        for i in range(1, n + 1):
            fw.write('{},{}\n'.format(i, result[i - 1]))
    print('Result saved successfully... and the path = {}'.format(csvName))


def trainRF():
    start_time = time.time()
    # Load the data
    train_data, test_data, data, label = opencsv()
    print("load data finish")
    stop_time_l = time.time()
    print('load data time used:%f s' % (stop_time_l - start_time))

    startTime = time.time()
    # Model training (preprocessing: dimensionality reduction)
    data_pca = dRPCA(data, 100)

    X_train, X_test, y_train, y_test = train_test_split(
        data_pca[0:len(train_data)], label, test_size=0.1, random_state=34)

    rfClf = trainModel(X_train, y_train)

    # Save the results
    storeModel(data_pca[len(train_data):], os.path.join(data_dir, 'output/Result_sklearn_rf.pcaPreData'))
    storeModel(rfClf, os.path.join(data_dir, 'output/Result_sklearn_rf.model'))

    # Model accuracy
    y_predict = rfClf.predict(X_test)
    printAccuracy(y_test, y_predict)

    print("finish!")
    stopTime = time.time()
    print('TrainModel store time used:%f s' % (stopTime - startTime))


def preRF():
    startTime = time.time()
    # Load the model and the data
    clf = getModel(os.path.join(data_dir, 'output/Result_sklearn_rf.model'))
    pcaPreData = getModel(os.path.join(data_dir, 'output/Result_sklearn_rf.pcaPreData'))

    # Predict the results
    result = clf.predict(pcaPreData)

    # Write out the results
    saveResult(result, os.path.join(data_dir, 'output/Result_sklearn_rf.csv'))
    print("finish!")
    stopTime = time.time()
    print('PreModel load time used:%f s' % (stopTime - startTime))


if __name__ == '__main__':

    # Train and save the model
    trainRF()

    # Load the prediction data set and predict
    preRF()