Merge pull request #218 from wang-sw/dev-1

对rf-python3.6.py进行整理,并调整了模型参数
This commit is contained in:
liudong 2018-05-19 19:57:22 +08:00 committed by GitHub
commit dbb788f453
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 145 additions and 39 deletions

View File

@ -2,8 +2,8 @@
# coding: utf-8
'''
Created on 2018-05-14
Update on 2018-05-14
Author: 平淡的天
Update on 2018-05-19
Author: 平淡的天/wang-sw
Github: https://github.com/apachecn/kaggle
'''
import os.path
@ -11,50 +11,156 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import pandas as pd
# from sklearn.grid_search import GridSearchCV
import numpy as np
# from sklearn.model_selection import GridSearchCV
# from numpy import arange
# from lightgbm import LGBMClassifier
# Root directory of the digit-recognizer dataset (absolute path on one machine).
# NOTE(review): data_dir is assigned a second, different path further down in
# this view; this first assignment may be the removed side of the diff — confirm
# against the real file.
data_dir = \
r'/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
import os.path
import time
# NOTE(review): the module-level statements below duplicate what opencsv() and
# dRPCA() do further down; they look like the pre-change flat script left over
# in this diff view (the +/- markers are missing) — confirm against the real file.
# Read the train and test CSV files from <data_dir>/input.
train_data = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
test_data = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
# Stack train and test rows into one frame with a fresh 0..n-1 index, then
# remove the 'label' column so only the remaining columns are kept.
data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)
data.drop(['label'], axis=1, inplace=True)
# Labels come from the training frame only.
label = train_data.label
# Data path
data_dir = '/media/wsw/B634091A3408DF6D/data/kaggle/datasets/getting-started/digit-recognizer/'
# Fit PCA with 100 components on the combined frame and transform it.
pca = PCA(n_components=100, random_state=34)
data_pca = pca.fit_transform(data)
# Load the data
def opencsv():
    """Read the train and test CSV files and build the combined feature table.

    Both files are read from ``<data_dir>/input``.

    Returns:
        A 4-tuple ``(train_data, test_data, data, label)``: the two frames as
        read, ``data`` as their row-wise concatenation with a fresh index and
        the ``label`` column removed, and ``label`` as ``train_data.label``.
    """
    # Open both files with pandas
    train_path = os.path.join(data_dir, 'input/train.csv')
    test_path = os.path.join(data_dir, 'input/test.csv')
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Stack train on top of test, renumber the rows, and drop the label column
    stacked = pd.concat([train_data, test_data], axis=0)
    data = stacked.reset_index(drop=True).drop(['label'], axis=1)

    return train_data, test_data, data, train_data.label
# Split the first len(train_data) rows of data_pca and their labels into
# train and held-out parts (test_size=0.1, random_state=34).
# NOTE(review): trainRF() performs the same split; this module-level statement
# looks like pre-change diff residue — confirm against the real file.
X_train, X_test, y_train, y_test = train_test_split(
data_pca[0:len(train_data)], label, test_size=0.1, random_state=34)
# Data preprocessing - dimensionality reduction with PCA (principal component analysis)
# Parameters:
#   data: feature table; converted with np.array before fitting.
#   COMPONENT_NUM: passed to PCA as n_components (default 100).
# Returns: data_pca, the result of pca.fit_transform(data).
# Side effects: prints progress and variance statistics, and pickles data_pca
# to <data_dir>/output/Result_sklearn_rf.pcaData through storeModel.
def dRPCA(data, COMPONENT_NUM=100):
print('dimensionality reduction...')
data = np.array(data)
# The bare string below holds usage notes (in Chinese) with a reference URL:
# n_components >= 1 (n_components=NUM) sets the number of features to keep;
# 0 < n_components < 1 (e.g. 0.99) sets a threshold on the share of total variance.
'''
使用说明https://www.cnblogs.com/pinard/p/6243025.html
n_components>=1
n_components=NUM  设置占特征数量
0 < n_components < 1
n_components=0.99 设置阈值总方差占比
'''
pca = PCA(n_components=COMPONENT_NUM, random_state=34)
data_pca = pca.fit_transform(data)
# NOTE(review): the classifier construction and commented-out tuning lines that
# follow are not used by anything dRPCA returns; they look like pre-change diff
# residue interleaved into this function (indentation and +/- markers are
# missing in this view) — confirm against the real file.
clf = RandomForestClassifier(
n_estimators=100,
max_depth=20,
min_samples_split=20,
min_samples_leaf=1,
random_state=34)
# clf=LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
# param_test1 = {'n_estimators':arange(10,150,10),'max_depth':arange(1,11,1)}
# gsearch1 = GridSearchCV(estimator = clf, param_grid = param_test1, scoring='accuracy',iid=False,cv=5)
# gsearch1.fit(Xtrain,xtest)
# print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
# PCA explained variance, explained variance ratio, and number of components
print(pca.explained_variance_, '\n', pca.explained_variance_ratio_, '\n',
pca.n_components_)
# Total share of variance covered by the kept components
print(sum(pca.explained_variance_ratio_))
# Persist the transformed data next to the other outputs
storeModel(data_pca, os.path.join(data_dir, 'output/Result_sklearn_rf.pcaData'))
return data_pca
# NOTE(review): trainModel() and printAccuracy() below contain the same steps;
# these module-level statements look like pre-change diff residue — confirm
# against the real file.
# Fit the classifier on the training split and predict the held-out split.
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
# A zero difference marks a prediction equal to the true label.
zeroLable = y_test - y_predict
rightCount = 0
for i in range(len(zeroLable)):
# list(zeroLable) is rebuilt on every iteration before indexing.
if list(zeroLable)[i] == 0:
rightCount += 1
# Accuracy = matching predictions / total predictions.
print('the right rate is:', float(rightCount) / len(zeroLable))
# Train the model
# Builds a RandomForestClassifier (n_estimators=140, max_depth=20,
# min_samples_split=2, min_samples_leaf=1, random_state=34), fits it on
# X_train / y_train, and returns the fitted classifier.
def trainModel(X_train, y_train):
print('Train RF...')
clf = RandomForestClassifier(
n_estimators=140,
max_depth=20,
min_samples_split=2,
min_samples_leaf=1,
random_state=34)
clf.fit(X_train, y_train) # train the random forest
# NOTE(review): the predict call below reads data_pca and train_data, which are
# not parameters of trainModel; it looks like pre-change diff residue
# interleaved into this function — confirm against the real file.
result = clf.predict(data_pca[len(train_data):])
# clf=LGBMClassifier(num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
# NOTE(review): likewise, the submission-writing lines below use test_data and
# result from outside this function's parameters; preRF()/saveResult() cover
# the same job — presumably residue, confirm.
n, _ = test_data.shape
# Write an 'ImageId,Label' CSV with 1-based ids, one row per prediction.
with open(os.path.join(data_dir, 'output/Result_sklearn_RF.csv'), 'w') as fw:
fw.write('{},{}\n'.format('ImageId', 'Label'))
for i in range(1, n + 1):
fw.write('{},{}\n'.format(i, result[i - 1]))
# param_test1 = {'n_estimators':arange(10,150,10),'max_depth':arange(1,21,1)}
# gsearch1 = GridSearchCV(estimator = clf, param_grid = param_test1, scoring='accuracy',iid=False,cv=5)
# gsearch1.fit(X_train, y_train)
# print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
# clf=gsearch1.best_estimator_
return clf
# Compute and print the accuracy
def printAccuracy(y_test, y_predict):
    """Print the fraction of predictions that equal the true labels.

    Args:
        y_test: true labels; must support ``y_test - y_predict`` element-wise
            (e.g. a NumPy array or pandas Series).
        y_predict: predicted labels, same length as ``y_test``.

    Raises:
        ZeroDivisionError: if the inputs are empty.
    """
    # A zero difference marks a correct prediction.  Materialise the
    # differences once; rebuilding the list on every loop iteration made the
    # original count quadratic in the number of samples.
    differences = list(y_test - y_predict)
    rightCount = sum(1 for delta in differences if delta == 0)
    print('the right rate is:', float(rightCount) / len(differences))
# Store the model
def storeModel(model, filename):
    """Pickle ``model`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content is replaced.

    Args:
        model: any picklable object (a fitted estimator, an array, ...).
        filename: destination path.
    """
    from pickle import Pickler

    with open(filename, 'wb') as sink:
        Pickler(sink).dump(model)
# Load the model
def getModel(filename):
    """Return the object pickled in the file at ``filename``.

    Args:
        filename: path of a file previously written with ``pickle.dump``.

    Returns:
        The unpickled object.
    """
    import pickle

    # NOTE: unpickling can execute arbitrary code; only load files this
    # project wrote itself.
    # The context manager closes the handle, which the original left open.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# Save the prediction results
def saveResult(result, csvName):
    """Write predictions to ``csvName`` as an ``ImageId,Label`` CSV.

    The ids are taken from the first column of
    ``<data_dir>/output/sample_submission.csv`` (its header row is skipped);
    the n-th data row of that file is paired with ``result[n]``.

    Args:
        result: indexable sequence of predicted labels, at least as long as
            the number of data rows in the sample submission.
        csvName: path of the CSV file to create or overwrite.

    Raises:
        IndexError: if ``result`` has fewer entries than the sample file has
            data rows.
    """
    sample_path = os.path.join(data_dir, 'output/sample_submission.csv')
    # Open the sample first so a missing sample file does not truncate an
    # existing output; both handles are closed even if writing fails.
    with open(sample_path) as pred_file, open(csvName, 'w') as fw:
        fw.write('{},{}\n'.format('ImageId', 'Label'))
        next(pred_file, None)  # skip the header row (no-op on an empty file)
        for i, line in enumerate(pred_file):
            splits = line.strip().split(',')
            fw.write('{},{}\n'.format(splits[0], result[i]))
    print('Result saved successfully...')
def trainRF():
    """Load the data, fit the random forest, and persist model and test features.

    Steps: load the CSVs, reduce to 100 PCA components, hold out 10% of the
    training rows, train the classifier, pickle the PCA-transformed test rows
    and the classifier under ``<data_dir>/output``, then print the held-out
    accuracy. Elapsed times for loading and for the training stage are printed.
    """
    load_begin = time.time()
    # Load the data
    train_data, _test_data, data, label = opencsv()
    print("load data finish")
    load_end = time.time()
    print('load data time used:%f s' % (load_end - load_begin))

    train_begin = time.time()
    # Model training (data preprocessing - dimensionality reduction)
    data_pca = dRPCA(data, 100)
    n_train = len(train_data)
    train_rows = data_pca[0:n_train]
    X_train, X_test, y_train, y_test = train_test_split(
        train_rows, label, test_size=0.1, random_state=34)
    rfClf = trainModel(X_train, y_train)

    # Save the results: transformed test rows first, then the fitted model
    pca_test_path = os.path.join(data_dir, 'output/Result_sklearn_rf.pcaPreData')
    storeModel(data_pca[n_train:], pca_test_path)
    model_path = os.path.join(data_dir, 'output/Result_sklearn_rf.model')
    storeModel(rfClf, model_path)

    # Model accuracy on the held-out rows
    printAccuracy(y_test, rfClf.predict(X_test))

    print("finish!")
    train_end = time.time()
    print('TrainModel store time used:%f s' % (train_end - train_begin))
def preRF():
    """Predict on the stored test features with the stored model and save a CSV.

    Reads the pickled classifier and PCA-transformed test rows from
    ``<data_dir>/output``, writes the predictions to
    ``output/Result_sklearn_rf.csv`` via ``saveResult``, and prints the
    elapsed time.
    """
    began = time.time()

    # Load the model and the data
    output_dir = os.path.join(data_dir, 'output/Result_sklearn_rf')
    clf = getModel(output_dir + '.model')
    pcaPreData = getModel(output_dir + '.pcaPreData')

    # Predict
    result = clf.predict(pcaPreData)

    # Write the results
    saveResult(result, output_dir + '.csv')
    print("finish!")

    ended = time.time()
    print('PreModel load time used:%f s' % (ended - began))
if __name__ == '__main__':
    # Run the pipeline in order: train and save the model first, then load
    # it back and predict on the stored test features.
    for stage in (trainRF, preRF):
        stage()