Merge pull request #203 from xuehuachunsheng/dev
哈哈 优化很棒 666 | Modified the data path of svm-python3.6.py and added the data visualization module
This commit is contained in:
commit
2fa86ff899
|
@ -14,15 +14,15 @@ import numpy as np
|
|||
import pandas as pd
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
import sys
|
||||
|
||||
data_dir = '/opt/data/kaggle/getting-started/digit-recognizer/'
|
||||
|
||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
||||
|
||||
# 加载数据
|
||||
def opencsv():
|
||||
# 使用 pandas 打开
|
||||
data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
|
||||
data1 = pd.read_csv(os.path.join(data_dir, 'test.csv'))
|
||||
data = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
|
||||
data1 = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
|
||||
|
||||
train_data = data.values[0:, 1:] # 读入全部训练数据, [行,列]
|
||||
train_label = data.values[0:, 0] # 读取列表的第一列
|
||||
|
@ -95,9 +95,9 @@ def dRecognition_knn():
|
|||
|
||||
# 结果预测
|
||||
testLabel = knnClf.predict(testData)
|
||||
|
||||
|
||||
# 结果的输出
|
||||
saveResult(testLabel, os.path.join(data_dir, 'Result_sklearn_knn.csv'))
|
||||
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_knn.csv'))
|
||||
print("finish!")
|
||||
stop_time_r = time.time()
|
||||
print('classify time used:%f' % (stop_time_r - start_time))
|
||||
|
|
|
@ -9,22 +9,26 @@ Github: https://github.com/apachecn/kaggle
|
|||
PCA主成分分析
|
||||
'''
|
||||
|
||||
import os.path
|
||||
import csv
|
||||
import time
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.metrics import classification_report
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# 数据路径
|
||||
data_dir = '/Users/wuyanxue/Documents/GitHub/datasets/getting-started/digit-recognizer/'
|
||||
|
||||
# 加载数据
|
||||
def opencsv():
|
||||
print('Load Data...')
|
||||
# 使用 pandas 打开
|
||||
dataTrain = pd.read_csv('datasets/getting-started/digit-recognizer/input/train.csv')
|
||||
dataPre = pd.read_csv('datasets/getting-started/digit-recognizer/input/test.csv')
|
||||
dataTrain = pd.read_csv(os.path.join(data_dir, 'input/train.csv'))
|
||||
dataPre = pd.read_csv(os.path.join(data_dir, 'input/test.csv'))
|
||||
trainData = dataTrain.values[:, 1:] # 读入全部训练数据
|
||||
trainLabel = dataTrain.values[:, 0]
|
||||
preData = dataPre.values[:, :] # 测试全部测试个数据
|
||||
|
@ -57,6 +61,7 @@ def dRCsv(x_train, x_test, preData, COMPONENT_NUM):
|
|||
return pcaTrainData, pcaTestData, pcaPreData
|
||||
|
||||
|
||||
|
||||
# 训练模型
|
||||
def trainModel(trainData, trainLabel):
|
||||
print('Train SVM...')
|
||||
|
@ -179,7 +184,6 @@ def getModel(filename):
|
|||
fr = open(filename, 'rb')
|
||||
return pickle.load(fr)
|
||||
|
||||
|
||||
def trainDRSVM():
|
||||
startTime = time.time()
|
||||
|
||||
|
@ -188,8 +192,8 @@ def trainDRSVM():
|
|||
# 模型训练 (数据预处理-降维)
|
||||
optimalSVMClf, pcaPreData = getOptimalAccuracy(trainData, trainLabel, preData)
|
||||
|
||||
storeModel(optimalSVMClf, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.model')
|
||||
storeModel(pcaPreData, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.pcaPreData')
|
||||
storeModel(optimalSVMClf, os.path.join(data_dir, 'output/Result_sklearn_SVM.model'))
|
||||
storeModel(pcaPreData, os.path.join(data_dir, 'output/Result_sklearn_SVM.pcaPreData'))
|
||||
|
||||
print("finish!")
|
||||
stopTime = time.time()
|
||||
|
@ -199,25 +203,42 @@ def trainDRSVM():
|
|||
def preDRSVM():
|
||||
startTime = time.time()
|
||||
# 加载模型和数据
|
||||
optimalSVMClf = getModel('datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.model')
|
||||
pcaPreData = getModel('datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.pcaPreData')
|
||||
optimalSVMClf = getModel(os.path.join(data_dir, 'output/Result_sklearn_SVM.model'))
|
||||
pcaPreData = getModel(os.path.join(data_dir, 'output/Result_sklearn_SVM.pcaPreData'))
|
||||
|
||||
# 结果预测
|
||||
testLabel = optimalSVMClf.predict(pcaPreData)
|
||||
# print("testLabel = %f" % testscore)
|
||||
# 结果的输出
|
||||
saveResult(testLabel, 'datasets/getting-started/digit-recognizer/ouput/Result_sklearn_SVM.csv')
|
||||
saveResult(testLabel, os.path.join(data_dir, 'output/Result_sklearn_SVM.csv'))
|
||||
print("finish!")
|
||||
stopTime = time.time()
|
||||
print('PreModel load time used:%f s' % (stopTime - startTime))
|
||||
|
||||
# 数据可视化
|
||||
def dataVisulization(data, labels):
    """Project ``data`` onto two principal components and show a scatter plot.

    Each distinct value in ``labels`` is drawn as its own scatter series and
    listed in the legend so the classes can be told apart.

    Args:
        data: 2-D array-like of samples (one row per sample) handed to PCA.
        labels: 1-D array-like holding one class label per row of ``data``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Coerce to an ndarray so `labels == cClass` is an element-wise mask even
    # when a plain list is passed (a list compared to a scalar is just False,
    # which would silently select no points).
    labels = np.asarray(labels)

    # Reduce to 2 dimensions with PCA.
    pca = PCA(n_components=2, whiten=True)
    pcaData = pca.fit_transform(data)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # np.unique returns the classes sorted, so the series order (and therefore
    # the colour given to each class) is the same on every run.
    for cClass in np.unique(labels):
        mask = labels == cClass
        ax.scatter(pcaData[mask, 0], pcaData[mask, 1], label=str(cClass))
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    ax.set_title('MNIST visualization')
    ax.legend()
    plt.show()
|
||||
|
||||
if __name__ == '__main__':
|
||||
trainData, trainLabel, preData = opencsv()
|
||||
dataVisulization(trainData, trainLabel)
|
||||
|
||||
|
||||
# 训练并保存模型
|
||||
trainDRSVM()
|
||||
#trainDRSVM()
|
||||
|
||||
# 分析数据
|
||||
analyse_data(trainData)
|
||||
#analyse_data(trainData)
|
||||
# 加载预测数据集
|
||||
# preDRSVM()
|
||||
#preDRSVM()
|
||||
|
|
Loading…
Reference in New Issue