Add regressor class with variable-random trees

yoshoku 2019-12-11 22:17:36 +09:00
parent 5840324a2a
commit 0876594a42
3 changed files with 216 additions and 0 deletions

lib/rumale.rb

@@ -61,6 +61,7 @@ require 'rumale/ensemble/random_forest_regressor'
require 'rumale/ensemble/extra_trees_classifier'
require 'rumale/ensemble/extra_trees_regressor'
require 'rumale/ensemble/variable_random_trees_classifier'
require 'rumale/ensemble/variable_random_trees_regressor'
require 'rumale/clustering/k_means'
require 'rumale/clustering/k_medoids'
require 'rumale/clustering/gaussian_mixture'
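
With this require added to the top-level file, loading the gem is enough to reach the new class. A quick sanity check (assumes a build of the gem that includes this commit; not part of the diff):

require 'rumale'
# The constant resolves once lib/rumale.rb has required the new file.
p defined?(Rumale::Ensemble::VariableRandomTreesRegressor) # => "constant"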

lib/rumale/ensemble/variable_random_trees_regressor.rb

@@ -0,0 +1,124 @@
# frozen_string_literal: true

require 'rumale/tree/variable_random_tree_regressor'
require 'rumale/ensemble/random_forest_regressor'

module Rumale
  module Ensemble
    # VariableRandomTreesRegressor is a class that implements variable-random trees for regression.
    #
    # @example
    #   estimator =
    #     Rumale::Ensemble::VariableRandomTreesRegressor.new(
    #       n_estimators: 10, criterion: 'mse', max_depth: 3, max_leaf_nodes: 10, min_samples_leaf: 5, random_seed: 1)
    #   estimator.fit(training_samples, training_values)
    #   results = estimator.predict(testing_samples)
    #
    # *Reference*
    # - F. T. Liu, K. M. Ting, Y. Yu, and Z.-H. Zhou, "Spectrum of Variable-Random Trees," Journal of Artificial Intelligence Research, vol. 32, pp. 355--384, 2008.
    class VariableRandomTreesRegressor < RandomForestRegressor
      # Return the set of estimators.
      # @return [Array<VariableRandomTreeRegressor>]
      attr_reader :estimators

      # Return the importance for each feature.
      # @return [Numo::DFloat] (size: n_features)
      attr_reader :feature_importances

      # Return the random generator for random selection of feature index.
      # @return [Random]
      attr_reader :rng

      # Create a new regressor with variable-random trees.
      #
      # @param n_estimators [Integer] The number of trees for constructing variable-random trees.
      # @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
      # @param max_depth [Integer] The maximum depth of the tree.
      #   If nil is given, variable-random tree grows without concern for depth.
      # @param max_leaf_nodes [Integer] The maximum number of leaves on variable-random tree.
      #   If nil is given, number of leaves is not limited.
      # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
      # @param max_features [Integer] The number of features to consider when searching optimal split point.
      #   If nil is given, split process considers 'Math.sqrt(n_features)' features.
      # @param n_jobs [Integer] The number of jobs for running the fit and predict methods in parallel.
      #   If nil is given, the methods do not execute in parallel.
      #   If zero or less is given, it becomes equal to the number of processors.
      #   This parameter is ignored if the Parallel gem is not loaded.
      # @param random_seed [Integer] The seed value used to initialize the random generator.
      #   It is used to randomly determine the order of features when deciding splitting point.
      def initialize(n_estimators: 10,
                     criterion: 'mse', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                     max_features: nil, n_jobs: nil, random_seed: nil)
        check_params_numeric_or_nil(max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
                                    max_features: max_features, n_jobs: n_jobs, random_seed: random_seed)
        check_params_numeric(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
        check_params_string(criterion: criterion)
        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
                              max_features: max_features)
        super
      end

      # Fit the model with given training data.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
      # @param y [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values to be used for fitting the model.
      # @return [VariableRandomTreesRegressor] The learned regressor itself.
      def fit(x, y)
        x = check_convert_sample_array(x)
        y = check_convert_tvalue_array(y)
        check_sample_tvalue_size(x, y)
        # Initialize some variables.
        n_features = x.shape[1]
        @params[:max_features] = Math.sqrt(n_features).to_i if @params[:max_features].nil?
        @params[:max_features] = [[1, @params[:max_features]].max, n_features].min
        sub_rng = @rng.dup
        # Construct forest.
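        # Each tree is planted with its own alpha in [0, 0.5): alpha = 0 gives
        # purely deterministic split selection, and larger values make splits
        # increasingly random, yielding the spectrum described in the reference.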
        alpha_step = 0.5 / @params[:n_estimators]
        alpha_vals = Array.new(@params[:n_estimators]) { |n| alpha_step * n }
        rng_seeds = Array.new(@params[:n_estimators]) { sub_rng.rand(Rumale::Values.int_max) }
        @estimators = if enable_parallel?
                        parallel_map(@params[:n_estimators]) { |n| plant_tree(alpha_vals[n], rng_seeds[n]).fit(x, y) }
                      else
                        Array.new(@params[:n_estimators]) { |n| plant_tree(alpha_vals[n], rng_seeds[n]).fit(x, y) }
                      end
        @feature_importances =
          if enable_parallel?
            parallel_map(@params[:n_estimators]) { |n| @estimators[n].feature_importances }.reduce(&:+)
          else
            @estimators.map(&:feature_importances).reduce(&:+)
          end
        @feature_importances /= @feature_importances.sum
        self
      end

      # Predict values for samples.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the values.
      # @return [Numo::DFloat] (shape: [n_samples, n_outputs]) Predicted value per sample.
      def predict(x)
        x = check_convert_sample_array(x)
        super
      end

      # Return the index of the leaf that each sample reached.
      #
      # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to assign each leaf.
      # @return [Numo::Int32] (shape: [n_samples, n_estimators]) Leaf index for sample.
      def apply(x)
        x = check_convert_sample_array(x)
        super
      end

      private

      def plant_tree(alpha, rnd_seed)
        Tree::VariableRandomTreeRegressor.new(
          criterion: @params[:criterion], max_depth: @params[:max_depth],
          max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
          max_features: @params[:max_features], alpha: alpha, random_seed: rnd_seed
        )
      end
    end
  end
end
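
For reference, the alpha schedule built in fit spreads the ensemble evenly over [0, 0.5). A standalone sketch of the values produced by the two lines under "Construct forest." above, using the default n_estimators:

n_estimators = 10
alpha_step = 0.5 / n_estimators
p Array.new(n_estimators) { |n| alpha_step * n }
# => roughly [0.0, 0.05, 0.1, ..., 0.45] (modulo floating-point rounding)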

spec/rumale/ensemble/variable_random_trees_regressor_spec.rb

@@ -0,0 +1,91 @@
# frozen_string_literal: true

require 'spec_helper'

RSpec.describe Rumale::Ensemble::VariableRandomTreesRegressor do
  let(:x) { two_clusters_dataset[0] }
  let(:n_samples) { x.shape[0] }
  let(:n_features) { x.shape[1] }
  let(:n_estimators) { 10 }
  let(:n_jobs) { nil }
  let(:estimator) do
    described_class.new(n_estimators: n_estimators, criterion: 'mae', max_features: 2, n_jobs: n_jobs, random_seed: 9).fit(x, y)
  end
  let(:predicted) { estimator.predict(x) }
  let(:score) { estimator.score(x, y) }

  context 'when single target problem' do
    let(:y) { x[true, 0] + x[true, 1]**2 }
    let(:index_mat) { estimator.apply(x) }
    let(:copied) { Marshal.load(Marshal.dump(estimator)) }

    it 'learns the model for single regression problem.', :aggregate_failures do
      expect(estimator.params[:n_estimators]).to eq(n_estimators)
      expect(estimator.params[:criterion]).to eq('mae')
      expect(estimator.params[:max_features]).to eq(2)
      expect(estimator.estimators.class).to eq(Array)
      expect(estimator.estimators.size).to eq(n_estimators)
      expect(estimator.estimators[0].class).to eq(Rumale::Tree::VariableRandomTreeRegressor)
      expect(estimator.feature_importances.class).to eq(Numo::DFloat)
      expect(estimator.feature_importances.ndim).to eq(1)
      expect(estimator.feature_importances.shape[0]).to eq(n_features)
      expect(predicted.class).to eq(Numo::DFloat)
      expect(predicted.ndim).to eq(1)
      expect(predicted.shape[0]).to eq(n_samples)
      expect(score).to be_within(0.01).of(1.0)
    end

    it 'returns leaf index that each sample reached.', :aggregate_failures do
      expect(index_mat.ndim).to eq(2)
      expect(index_mat.shape[0]).to eq(n_samples)
      expect(index_mat.shape[1]).to eq(n_estimators)
      expect(index_mat[true, 0]).to eq(estimator.estimators[0].apply(x))
    end

    it 'dumps and restores itself using Marshal module.', :aggregate_failures do
      expect(estimator.class).to eq(copied.class)
      expect(estimator.params).to match(copied.params)
      expect(estimator.estimators.size).to eq(copied.estimators.size)
      expect(estimator.feature_importances).to eq(copied.feature_importances)
      expect(estimator.rng).to eq(copied.rng)
      expect(score).to eq(copied.score(x, y))
    end
  end

  context 'when multi-target problem' do
    let(:y) { Numo::DFloat[x[true, 0].to_a, (x[true, 1]**2).to_a].transpose.dot(Numo::DFloat[[0.6, 0.4], [0.0, 0.1]]) }
    let(:n_outputs) { y.shape[1] }

    it 'learns the model for multiple regression problem.', :aggregate_failures do
      expect(estimator.estimators.class).to eq(Array)
      expect(estimator.estimators.size).to eq(n_estimators)
      expect(estimator.estimators[0].class).to eq(Rumale::Tree::VariableRandomTreeRegressor)
      expect(estimator.feature_importances.class).to eq(Numo::DFloat)
      expect(estimator.feature_importances.ndim).to eq(1)
      expect(estimator.feature_importances.shape[0]).to eq(n_features)
      expect(predicted.class).to eq(Numo::DFloat)
      expect(predicted.ndim).to eq(2)
      expect(predicted.shape[0]).to eq(n_samples)
      expect(predicted.shape[1]).to eq(n_outputs)
      expect(score).to be_within(0.01).of(1.0)
    end

    context 'when n_jobs parameter is not nil' do
      let(:n_jobs) { -1 }

      it 'learns the model for multiple regression problem in parallel.', :aggregate_failures do
        expect(estimator.estimators.class).to eq(Array)
        expect(estimator.estimators.size).to eq(n_estimators)
        expect(estimator.estimators[0].class).to eq(Rumale::Tree::VariableRandomTreeRegressor)
        expect(estimator.feature_importances.class).to eq(Numo::DFloat)
        expect(estimator.feature_importances.ndim).to eq(1)
        expect(estimator.feature_importances.shape[0]).to eq(n_features)
        expect(predicted.class).to eq(Numo::DFloat)
        expect(predicted.ndim).to eq(2)
        expect(predicted.shape[0]).to eq(n_samples)
        expect(predicted.shape[1]).to eq(n_outputs)
        expect(score).to be_within(0.01).of(1.0)
      end
    end
  end
end
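
The Marshal round-trip checked by the spec is also how a fitted model can be persisted in practice. A minimal sketch (the file name is illustrative):

# Save a fitted estimator to disk and restore it; the spec above verifies that
# params, feature importances, and scores survive this round-trip.
File.binwrite('vr_trees_regressor.dat', Marshal.dump(estimator))
restored = Marshal.load(File.binread('vr_trees_regressor.dat'))
puts restored.score(x, y)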