# -*- coding: utf-8 -*-
__author__ = 'liudong'
__date__ = '2018/5/29 7:40 PM'

import csv

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


def load_data(train_path, test_path):
    """
    Load the train and test data sets
    :param train_path: path for the train set file
    :param test_path: path for the test set file
    :return: a pandas DataFrame for each set
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    print("number of training examples = " + str(train_data.shape[0]))  # 1460
    print("number of test examples = " + str(test_data.shape[0]))  # 1459
    print("train shape: " + str(train_data.shape))  # (1460, 81)
    print("test shape: " + str(test_data.shape))  # (1459, 80)

    return train_data, test_data


def pre_process_data(df):
    """
    Perform a number of pre-processing steps on the data set
    :param df: pandas data frame
    :return: processed data frame
    """
    # one-hot encode categorical values
    df = pd.get_dummies(df)

    return df


def mini_batches(train_set, train_labels, mini_batch_size):
    """
    Generate mini batches from the data set (data and labels)
    :param train_set: data set with the examples
    :param train_labels: data set with the labels
    :param mini_batch_size: mini batch size
    :return: list of (data, labels) mini batches
    """
    set_size = train_set.shape[0]
    batches = []
    num_complete_minibatches = set_size // mini_batch_size

    for k in range(0, num_complete_minibatches):
        mini_batch_x = train_set[k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch_y = train_labels[k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batch_x, mini_batch_y)
        batches.append(mini_batch)

    # Handle the end case (last mini-batch < mini_batch_size)
    if set_size % mini_batch_size != 0:
        mini_batch_x = train_set[(set_size - (set_size % mini_batch_size)):]
        mini_batch_y = train_labels[(set_size - (set_size % mini_batch_size)):]
        mini_batch = (mini_batch_x, mini_batch_y)
        batches.append(mini_batch)

    return batches
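# A quick sanity check of mini_batches with hypothetical shapes: 10 examples
# in batches of 3 yield three full batches plus a final batch of size 1.
# Note that batches are taken in order; the set is not shuffled here.
#   batches = mini_batches(np.zeros((10, 5)), np.zeros((10, 1)), 3)
#   [b[0].shape[0] for b in batches]  # -> [3, 3, 3, 1]

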
def create_placeholders(input_size, output_size):
    """
    Creates the placeholders for the tensorflow session.
    :param input_size: scalar, input size
    :param output_size: scalar, output size
    :return: X placeholder for the data input, of shape [None, input_size] and dtype "float"
    :return: Y placeholder for the input labels, of shape [None, output_size] and dtype "float"
    """
    x = tf.placeholder(shape=(None, input_size), dtype=tf.float32, name="X")
    y = tf.placeholder(shape=(None, output_size), dtype=tf.float32, name="Y")

    return x, y


def forward_propagation(x, parameters, keep_prob=1.0, hidden_activation='relu'):
    """
    Implement forward propagation with dropout for the [LINEAR->RELU]*(L-1)->LINEAR computation
    :param x: data, tensor of shape (number of examples, input size)
    :param parameters: output of initialize_parameters()
    :param keep_prob: probability to keep each node of the layer
    :param hidden_activation: activation function of the hidden layers
    :return: last LINEAR value
    """
    a_dropout = x
    n_layers = len(parameters) // 2  # number of layers in the neural network

    for l in range(1, n_layers):
        a_prev = a_dropout
        a_dropout = linear_activation_forward(a_prev, parameters['w%s' % l], parameters['b%s' % l], hidden_activation)

        if keep_prob < 1.0:
            a_dropout = tf.nn.dropout(a_dropout, keep_prob)

    # the output layer is linear (no activation, no dropout), as this is a regression
    al = tf.matmul(a_dropout, parameters['w%s' % n_layers]) + parameters['b%s' % n_layers]

    return al
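# tf.nn.dropout above uses inverted dropout: the kept activations are scaled up
# by 1 / keep_prob at train time, so no rescaling is needed at prediction time.

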
def linear_activation_forward(a_prev, w, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer
    :param a_prev: activations from previous layer (or input data): (number of examples, size of previous layer)
    :param w: weights matrix: tensor of shape (size of previous layer, size of current layer)
    :param b: bias vector, tensor of shape (size of current layer,)
    :param activation: the activation to be used in this layer, stored as a text string: "sigmoid", "relu" or "leaky relu"
    :return: the output of the activation function, also called the post-activation value
    """
    z = tf.matmul(a_prev, w) + b

    if activation == "sigmoid":
        a = tf.nn.sigmoid(z)
    elif activation == "relu":
        a = tf.nn.relu(z)
    elif activation == "leaky relu":
        a = tf.nn.leaky_relu(z)
    else:
        raise ValueError("Unsupported activation: %s" % activation)

    return a


def initialize_parameters(layer_dims):
    """
    :param layer_dims: python array (list) containing the dimensions of each layer in our network
    :return: python dictionary containing your parameters "w1", "b1", ..., "wn", "bn":
             wl -- weight matrix of shape (layer_dims[l-1], layer_dims[l])
             bl -- bias vector of shape (layer_dims[l],)
    """
    parameters = {}
    n_layers = len(layer_dims)

    for l in range(1, n_layers):
        parameters['w' + str(l)] = tf.get_variable('w' + str(l), [layer_dims[l - 1], layer_dims[l]],
                                                   initializer=tf.contrib.layers.xavier_initializer())
        parameters['b' + str(l)] = tf.get_variable('b' + str(l), [layer_dims[l]], initializer=tf.zeros_initializer())

    return parameters
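# For example (hypothetical dims), layer_dims = [10, 5, 1] creates:
#   w1: (10, 5), b1: (5,), w2: (5, 1), b2: (1,)
# matching the (previous layer, current layer) convention used by tf.matmul
# in forward_propagation.

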
def compute_cost(z3, y):
    """
    :param z3: output of forward propagation (output of the last LINEAR unit)
    :param y: "true" labels vector placeholder, same shape as z3
    :return: Tensor of the cost function (RMSE, as this is a regression)
    """
    cost = tf.sqrt(tf.reduce_mean(tf.square(y - z3)))

    return cost
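# Worked example (hypothetical values): for y = [[100.], [200.]] and
# z3 = [[110.], [190.]], cost = sqrt((10^2 + 10^2) / 2) = 10.0

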
def predict(data, parameters):
    """
    make a prediction based on a data set and trained parameters
    :param data: data set to predict on
    :param parameters: trained parameters (numpy values, as returned by the model)
    :return: array of predictions
    """
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)

        dataset = tf.cast(tf.constant(data), tf.float32)
        fw_prop_result = forward_propagation(dataset, parameters)
        prediction = fw_prop_result.eval()

    return prediction
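# Note: predict above calls forward_propagation with its default keep_prob=1.0,
# so dropout is disabled at prediction time, as it should be.

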
def rmse(predictions, labels):
    """
    calculate the root mean squared error between two data sets
    :param predictions: data set of predictions
    :param labels: data set of labels (real values)
    :return: root mean squared error of the predictions
    """
    prediction_size = predictions.shape[0]
    prediction_cost = np.sqrt(np.sum(np.square(labels - predictions)) / prediction_size)

    return prediction_cost


def rmsle(predictions, labels):
    """
    calculate the root mean squared logarithmic error between two data sets
    :param predictions: data set of predictions
    :param labels: data set of labels (real values)
    :return: root mean squared logarithmic error of the predictions
    """
    prediction_size = predictions.shape[0]
    prediction_cost = np.sqrt(np.sum(np.square(np.log(predictions + 1) - np.log(labels + 1))) / prediction_size)

    return prediction_cost
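# np.log(predictions + 1) is equivalent to the numerically safer
# np.log1p(predictions). RMSLE penalizes relative rather than absolute error,
# which suits price prediction where values span orders of magnitude.

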
def l2_regularizer(cost, l2_beta, parameters, n_layers):
    """
    Function to apply l2 regularization to the model
    :param cost: usual cost of the model
    :param l2_beta: beta value used for the regularization
    :param parameters: parameters from the model (used to get weights values)
    :param n_layers: number of layers of the model
    :return: updated cost
    """
    regularizer = 0
    for i in range(1, n_layers):
        regularizer += tf.nn.l2_loss(parameters['w%s' % i])

    cost = tf.reduce_mean(cost + l2_beta * regularizer)

    return cost
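# tf.nn.l2_loss(w) computes sum(w ** 2) / 2, so the regularized cost is
# cost + l2_beta * sum_l ||w_l||^2 / 2 over the weight matrices (biases are
# left out, a common choice).

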
def build_submission_name(layers_dims, num_epochs, lr_decay,
                          learning_rate, l2_beta, keep_prob, minibatch_size, num_examples):
    """
    builds a string (submission file name) based on the model parameters
    :param layers_dims: model layers dimensions
    :param num_epochs: model number of epochs
    :param lr_decay: model learning rate decay
    :param learning_rate: model learning rate
    :param l2_beta: beta used on l2 regularization
    :param keep_prob: keep probability used on dropout regularization
    :param minibatch_size: model mini batch size (0 to not use mini batches)
    :param num_examples: number of model examples (training data)
    :return: built string
    """
    submission_name = 'ly{}-epoch{}.csv' \
        .format(layers_dims, num_epochs)

    if lr_decay != 0:
        submission_name = 'lrdc{}-'.format(lr_decay) + submission_name
    else:
        submission_name = 'lr{}-'.format(learning_rate) + submission_name

    if l2_beta > 0:
        submission_name = 'l2{}-'.format(l2_beta) + submission_name

    if keep_prob < 1:
        submission_name = 'dk{}-'.format(keep_prob) + submission_name

    if minibatch_size != num_examples:
        submission_name = 'mb{}-'.format(minibatch_size) + submission_name

    return submission_name
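# Example (hypothetical arguments): layers_dims=[288, 500, 500, 1],
# num_epochs=10000, lr_decay=0, learning_rate=0.01, l2_beta=10, keep_prob=0.7
# and minibatch_size equal to num_examples produce:
#   'dk0.7-l210-lr0.01-ly[288, 500, 500, 1]-epoch10000.csv'

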
def plot_model_cost(train_costs, validation_costs, submission_name):
    """
    :param train_costs: array with the costs from the model training
    :param validation_costs: array with the costs from the model validation
    :param submission_name: name of the submission (used for the plot title)
    :return:
    """
    plt.plot(np.squeeze(train_costs), label='Train cost')
    plt.plot(np.squeeze(validation_costs), label='Validation cost')
    plt.ylabel('cost')
    plt.xlabel('epochs (per tens)')
    plt.title("Model: " + submission_name)
    plt.legend()
    plt.show()
    plt.close()


def model(train_set, train_labels, validation_set, validation_labels, layers_dims, learning_rate=0.01, num_epochs=1001,
          print_cost=True, plot_cost=True, l2_beta=0., keep_prob=1.0, hidden_activation='relu', return_best=False,
          minibatch_size=0, lr_decay=0):
    """
    :param train_set: training set
    :param train_labels: training labels
    :param validation_set: validation set
    :param validation_labels: validation labels
    :param layers_dims: array with the layer sizes for the model
    :param learning_rate: learning rate of the optimization
    :param num_epochs: number of epochs of the optimization loop
    :param print_cost: True to print the cost every 500 epochs
    :param plot_cost: True to plot the train and validation cost
    :param l2_beta: beta parameter for the l2 regularization
    :param keep_prob: probability to keep each node of each hidden layer (dropout)
    :param hidden_activation: activation function to be used on the hidden layers
    :param return_best: True to return the parameters from the epoch with the lowest validation cost
    :param minibatch_size: size of the mini batch
    :param lr_decay: if != 0, sets the learning rate decay on each epoch
    :return parameters: parameters learnt by the model. They can then be used to predict.
    :return submission_name: name for the trained model
    """
    ops.reset_default_graph()  # to be able to rerun the model without overwriting tf variables

    input_size = layers_dims[0]
    output_size = layers_dims[-1]
    num_examples = train_set.shape[0]
    n_layers = len(layers_dims)
    train_costs = []
    validation_costs = []
    best_iteration = [float('inf'), 0]  # [lowest validation cost, epoch it occurred]
    best_params = None

    if minibatch_size == 0 or minibatch_size > num_examples:
        minibatch_size = num_examples

    num_minibatches = num_examples // minibatch_size

    if num_minibatches == 0:
        num_minibatches = 1

    submission_name = build_submission_name(layers_dims, num_epochs, lr_decay, learning_rate, l2_beta, keep_prob,
                                            minibatch_size, num_examples)

    x, y = create_placeholders(input_size, output_size)
    tf_valid_dataset = tf.cast(tf.constant(validation_set), tf.float32)
    parameters = initialize_parameters(layers_dims)

    fw_output_train = forward_propagation(x, parameters, keep_prob, hidden_activation)
    train_cost = compute_cost(fw_output_train, y)

    # dropout is disabled (keep_prob=1.0) when evaluating the validation set
    fw_output_valid = forward_propagation(tf_valid_dataset, parameters, 1.0, hidden_activation)
    validation_cost = compute_cost(fw_output_valid, validation_labels)

    if l2_beta > 0:
        train_cost = l2_regularizer(train_cost, l2_beta, parameters, n_layers)
        validation_cost = l2_regularizer(validation_cost, l2_beta, parameters, n_layers)

    if lr_decay != 0:
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.inverse_time_decay(learning_rate, global_step=global_step, decay_rate=lr_decay,
                                                    decay_steps=1)
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(train_cost, global_step=global_step)
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(train_cost)

    # uncomment to use tensorboard
    # tf.summary.scalar('train cost', train_cost)
    # tf.summary.scalar('validation cost', validation_cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        # uncomment to use tensorboard
        # writer = tf.summary.FileWriter('logs/' + submission_name, sess.graph)

        sess.run(init)

        for epoch in range(num_epochs):
            train_epoch_cost = 0.
            validation_epoch_cost = 0.

            minibatches = mini_batches(train_set, train_labels, minibatch_size)

            for minibatch in minibatches:
                # uncomment to use tensorboard
                # merge = tf.summary.merge_all()

                (minibatch_X, minibatch_Y) = minibatch
                feed_dict = {x: minibatch_X, y: minibatch_Y}

                # uncomment to use tensorboard
                # _, summary, minibatch_train_cost, minibatch_validation_cost = sess.run(
                #     [optimizer, merge, train_cost, validation_cost], feed_dict=feed_dict)

                # comment to use tensorboard
                _, minibatch_train_cost, minibatch_validation_cost = sess.run(
                    [optimizer, train_cost, validation_cost], feed_dict=feed_dict)

                train_epoch_cost += minibatch_train_cost / num_minibatches
                validation_epoch_cost += minibatch_validation_cost / num_minibatches

            if print_cost is True and epoch % 500 == 0:
                print("Train cost after epoch %i: %f" % (epoch, train_epoch_cost))
                print("Validation cost after epoch %i: %f" % (epoch, validation_epoch_cost))

            if plot_cost is True and epoch % 10 == 0:
                train_costs.append(train_epoch_cost)
                validation_costs.append(validation_epoch_cost)

            # uncomment to use tensorboard
            # if epoch % 10 == 0:
            #     writer.add_summary(summary, epoch)

            if return_best is True and validation_epoch_cost < best_iteration[0]:
                best_iteration[0] = validation_epoch_cost
                best_iteration[1] = epoch
                best_params = sess.run(parameters)

        if return_best is True:
            parameters = best_params
        else:
            parameters = sess.run(parameters)

        print("Parameters have been trained, getting metrics...")

        train_rmse = rmse(predict(train_set, parameters), train_labels)
        validation_rmse = rmse(predict(validation_set, parameters), validation_labels)
        train_rmsle = rmsle(predict(train_set, parameters), train_labels)
        validation_rmsle = rmsle(predict(validation_set, parameters), validation_labels)

        print('Train rmse: {:.4f}'.format(train_rmse))
        print('Validation rmse: {:.4f}'.format(validation_rmse))
        print('Train rmsle: {:.4f}'.format(train_rmsle))
        print('Validation rmsle: {:.4f}'.format(validation_rmsle))

        submission_name = 'tr_cost-{:.2f}-vd_cost{:.2f}-'.format(train_rmse, validation_rmse) + submission_name

        if return_best is True:
            print('Lowest validation cost: {:.2f} at epoch {}'.format(best_iteration[0], best_iteration[1]))

        if plot_cost is True:
            plot_model_cost(train_costs, validation_costs, submission_name)

    return parameters, submission_name
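# The call to output_submission further down is commented out and the helper is
# not defined in this file; the following is a minimal sketch of it, with the
# signature inferred from that call and the body an assumption (it also puts
# the csv import above to use).
def output_submission(ids, predictions, id_column, prediction_column, file_name):
    """
    Write a submission csv file with one id column and one prediction column
    :param ids: array with the example ids
    :param predictions: array with the predicted values
    :param id_column: name for the id column
    :param prediction_column: name for the prediction column
    :param file_name: name of the csv file to write
    """
    with open(file_name, 'w') as submission_file:
        writer = csv.writer(submission_file)
        writer.writerow([id_column, prediction_column])
        for example_id, predicted_value in zip(ids, predictions):
            writer.writerow([example_id, predicted_value])

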
TRAIN_PATH = '/Users/liudong/Desktop/house_price/train.csv'
TEST_PATH = '/Users/liudong/Desktop/house_price/test.csv'

train, test = load_data(TRAIN_PATH, TEST_PATH)

# get the labels values
train_raw_labels = train['SalePrice'].to_frame().values

# pre process data sets
train_pre = pre_process_data(train)
test_pre = pre_process_data(test)

# drop unwanted columns
train_pre = train_pre.drop(['Id', 'SalePrice'], axis=1)
test_pre = test_pre.drop(['Id'], axis=1)

# align both data sets (by outer join), to make sure they have the same number of features;
# this is required because of the mismatched categorical values in the train and test sets
train_pre, test_pre = train_pre.align(test_pre, join='outer', axis=1)

# replace the NaN values added by align with 0
train_pre.replace(to_replace=np.nan, value=0, inplace=True)
test_pre.replace(to_replace=np.nan, value=0, inplace=True)

train_pre = train_pre.values.astype(np.float)
test_pre = test_pre.values.astype(np.float)

# scale values; fit the scaler on the train set only and reuse it for the test
# set, so both are scaled with the same statistics
standard_scaler = preprocessing.StandardScaler()
train_pre = standard_scaler.fit_transform(train_pre)
test_pre = standard_scaler.transform(test_pre)

X_train, X_valid, Y_train, Y_valid = train_test_split(train_pre, train_raw_labels, test_size=0.3, random_state=1)

# model hyperparameters
input_size = train_pre.shape[1]
output_size = 1
num_epochs = 10000
learning_rate = 0.01
layers_dims = [input_size, 500, 500, output_size]
parameters, submission_name = model(X_train, Y_train, X_valid, Y_valid, layers_dims, num_epochs=num_epochs,
                                    learning_rate=learning_rate, print_cost=True, plot_cost=True, l2_beta=10,
                                    keep_prob=0.7, minibatch_size=0, return_best=True)

print(submission_name)
prediction = list(map(lambda val: float(val), predict(test_pre, parameters)))
# uncomment if the label was log transformed
# prediction = list(map(lambda val: np.expm1(val), prediction))
# output_submission(test.Id.values, prediction, 'Id', 'SalePrice', submission_name)
# save the results
result = pd.DataFrame()
result['Id'] = test.Id.values
result['SalePrice'] = prediction
# index=False removes the row index from the csv
result.to_csv('/Users/liudong/Desktop/house_price/result1.csv', index=False)
print('########## training finished ##########')