Merge pull request #70 from chris-at-thewebfellas/master

Possible fix for issues #68, #66, and #63
2014-02-18 19:26:51 -05:00 · 2014-02-18 19:26:51 -05:00 · 690157af8b
parent 10e6612a06 a8ce6b2f18
commit 690157af8b
9 changed files with 65 additions and 54 deletions
--- a/lib/treat/entities/entity/buildable.rb
+++ b/lib/treat/entities/entity/buildable.rb
@ -210,8 +210,7 @@ module Treat::Entities::Entity::Buildable
      file.index('.xml')
      from_serialized_file(file)
    else
-      fmt = Treat::Workers::Formatters::
-      Readers::Autoselect.detect_format(file,def_fmt)
+      fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
      from_raw_file(file, fmt)
    end

--- a/lib/treat/entities/entity/stringable.rb
+++ b/lib/treat/entities/entity/stringable.rb
@ -58,16 +58,14 @@ module Treat::Entities::Entity::Stringable
   end
   
  # Helper method to implode the string value of the subtree.
-  def implode
+  def implode(value = "")
    
    return @value.dup if !has_children?
    
-    value = ''
-
    each do |child|
      
      if child.is_a?(Treat::Entities::Section)
-        value += "\n\n"
+        value << "\n\n"
      end
      
      if child.is_a?(Treat::Entities::Token) || child.value != ''
@ -75,14 +73,14 @@ module Treat::Entities::Entity::Stringable
          child.is_a?(Treat::Entities::Enclitic)
          value.strip!
        end
-        value += child.to_s + ' '
+        value << child.to_s + ' '
      else
-        value += child.implode
+        child.implode(value)
      end
      
      if child.is_a?(Treat::Entities::Title) ||
        child.is_a?(Treat::Entities::Paragraph)
-        value += "\n\n"
+        value << "\n\n"
      end
      
    end
--- a/lib/treat/workers/extractors/language/what_language.rb
+++ b/lib/treat/workers/extractors/language/what_language.rb
@ -1,9 +1,9 @@
 # Language detection using a probabilistic algorithm
-# that checks for the presence of words with Bloom 
+# that checks for the presence of words with Bloom
 # filters built from dictionaries for each language.
 #
-# Original paper: Grothoff. 2007. A Quick Introduction to 
-# Bloom Filters. Department of Computer Sciences, Purdue 
+# Original paper: Grothoff. 2007. A Quick Introduction to
+# Bloom Filters. Department of Computer Sciences, Purdue
 # University.
 class Treat::Workers::Extractors::Language::WhatLanguage

@ -35,7 +35,7 @@ class Treat::Workers::Extractors::Language::WhatLanguage

    options = DefaultOptions.merge(options)

-    @@detector ||= ::WhatLanguage.new(:possibilities)
+    @@detector ||= ::WhatLanguage.new(:all)
    possibilities = @@detector.process_text(entity.to_s)
    lang = {}

--- a/lib/treat/workers/lexicalizers/sensers/wordnet.rb
+++ b/lib/treat/workers/lexicalizers/sensers/wordnet.rb
@ -60,8 +60,7 @@ class Treat::Workers::Lexicalizers::Sensers::Wordnet
    
    lemma.synsets.each do |synset|
      synsets << 
-      Treat::Workers::Lexicalizers::
-      Sensers::Wordnet::Synset.new(synset)
+      Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
    end
    
    ((synsets.collect do |ss|
--- a/lib/treat/workers/processors/chunkers/autoselect.rb
+++ b/lib/treat/workers/processors/chunkers/autoselect.rb
@ -5,12 +5,10 @@ class Treat::Workers::Processors::Chunkers::Autoselect
      entity.set :format, 'txt'
    end
    begin
-      k = Treat::Workers::Processors::
-      Chunkers.const_get(entity.format.cc)
+      k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
      k.chunk(entity, options)
    rescue Treat::Exception
-      Treat::Workers::Processors::
-      Chunkers::TXT.chunk(entity, options)
+      Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
    end
    
  end
--- a/lib/treat/workers/processors/chunkers/txt.rb
+++ b/lib/treat/workers/processors/chunkers/txt.rb
@ -12,16 +12,13 @@ class Treat::Workers::Processors::Chunkers::TXT
    zones.each do |zone|
      zone.strip!
      next if zone == ''
-      c = Treat::Entities::
-      Zone.from_string(zone)
+      c = Treat::Entities::Zone.from_string(zone)
      if c.type == :title
        if current.type == :section
          current = current.parent
-          current = entity << Treat::
-          Entities::Section.new
+          current = entity << Treat::Entities::Section.new
        else
-          current = entity << Treat::
-          Entities::Section.new
+          current = entity << Treat::Entities::Section.new
        end
      end
      current << c
--- a/lib/treat/workers/processors/tokenizers/punkt.rb
+++ b/lib/treat/workers/processors/tokenizers/punkt.rb
@ -28,13 +28,10 @@ class Treat::Workers::Processors::Tokenizers::Punkt
    
    s.scan(ReWordTokenizer).each do |token|
      if SentEndChars.include?(token[-1])
-        entity << Treat::Entities::
-        Token.from_string(token[0..-2])
-        entity << Treat::Entities::
-        Token.from_string(token[-1..-1])
+        entity << Treat::Entities::Token.from_string(token[0..-2])
+        entity << Treat::Entities::Token.from_string(token[-1..-1])
      else
-        entity << Treat::Entities::
-        Token.from_string(token)
+        entity << Treat::Entities::Token.from_string(token)
      end
    end
    
--- a/spec/entities/collection.rb
+++ b/spec/entities/collection.rb
@ -1,5 +1,5 @@
 module Treat::Specs::Entities
-  
+
  describe Treat::Entities::Collection do

    before :all do
@ -17,7 +17,7 @@ module Treat::Specs::Entities
          it "recursively searches the folder for " +
          "files and opens them into a collection of documents" do
            collection = Treat::Entities::Collection.build(@file)
-            collection.size.should eql 5
+            collection.children.size.should eql 5
          end

        end
@ -64,7 +64,7 @@ end
      end

    end
-    
+
    describe "#search" do

      it "searches an indexed collection for a query " +
@ -77,7 +77,7 @@ end

        docs = collection.search :ferret, :q => 'Newton'
        docs.size.should eql 3
-        
+
        docs.map { |d| d.chunk.title.to_s }.should
        eql [
          "Isaac (Sir) Newton (1642-1727)",
@ -107,6 +107,6 @@ end
    end

  end
-  
+
 end
 =end
--- a/spec/entities/entity.rb
+++ b/spec/entities/entity.rb
@ -33,6 +33,24 @@ module Treat::Specs::Entities
      @adj_phrase << @adj
      @verb_phrase << [@aux, @verb]

+      @enc_phrase = Treat::Entities::Phrase.new
+      @enc_noun_phrase = Treat::Entities::Phrase.new
+      @enc_noun_phrase.set :tag, 'NP'
+      @enc_verb_phrase = Treat::Entities::Phrase.new
+      @enc_verb_phrase.set :tag, 'VP'
+      @enc_pronoun = Treat::Entities::Word.new('It')
+      @enc_pronoun.set :category, 'pronoun'
+      @enc_pronoun.set :tag, 'PRP'
+      @enc_enclitic = Treat::Entities::Enclitic.new('\'s')
+      @enc_enclitic.set :category, 'verb'
+      @enc_enclitic.set :tag, 'VBZ'
+      @enc_adj = Treat::Entities::Word.new('hot')
+      @enc_adj.set :category, 'adjectival'
+      @enc_adj.set :tag, 'ADJP'
+
+      @enc_noun_phrase << @enc_pronoun
+      @enc_verb_phrase << [ @enc_enclitic, @enc_adj ]
+      @enc_phrase << [ @enc_noun_phrase, @enc_verb_phrase ]
    end


@ -67,7 +85,7 @@ module Treat::Specs::Entities
      end

 =begin
-    
+
    describe "#frequency" do

      it "returns the frequency of the entity's value in the root" do
@ -82,9 +100,9 @@ module Treat::Specs::Entities
      it "returns the position of the entity's value "+
         "in the supplied parent type, or root if nil" do
           @noun_phrase.frequency_in(:sentence).should eql 1
-          
+
      end
-      
+
    end

 =end
@ -100,8 +118,8 @@ module Treat::Specs::Entities

          Treat::Entities::Entity.call_worker(
          '$'.to_entity, :tag, :lingua,
-          Treat::Workers::Lexicalizers::Taggers, {}).should
-          eql '$'.tag(:lingua)
+          Treat::Workers::Lexicalizers::Taggers, {}).
+          should  eql '$'.tag(:lingua)

        end

@ -284,25 +302,29 @@ module Treat::Specs::Entities
      describe "#to_s" do
        it "returns the string value of the " +
        "entity or its full subtree" do
-          @paragraph.to_s.should
-          eql 'The lazy fox is running.'
+          @paragraph.to_s.
+          should eql 'The lazy fox is running.'
          @noun.to_s.should eql 'fox'
+          @enc_phrase.to_s.
+          should eql 'It\'s hot'
        end
      end

      describe "#inspect" do
        it "returns an informative string " +
        "concerning the entity" do
-          @paragraph.inspect.should
-          be_an_instance_of String
+          @paragraph.inspect.
+          should be_an_instance_of String
        end
      end

      describe "#short_value" do
        it "returns a shortened version of the " +
        "entity's string value" do
-          @paragraph.short_value.should
-          eql 'The lazy fox is running.'
+          @paragraph.short_value.
+          should eql 'The lazy fox is running.'
+          @enc_phrase.short_value.
+          should eql 'It\'s hot'
        end
      end

@ -406,14 +428,15 @@ module Treat::Specs::Entities
          it "guesses the language of the entity" do

            Treat.core.language.detect = true
-            a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
-            b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
-            c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
-            d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
+            a = 'I want to know God\'s thoughts; the rest are details.' # Albert Einstein
+            b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran?' # Pablo Picasso
+            c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France.' # Goethe
+            d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen.' # Friedrich Nietzsche
+
            a.language.should eql :english
-            #b.language.should eql :spanish
-            #c.language.should eql :french
-            #d.language.should eql :german
+            b.language.should eql :spanish
+            c.language.should eql :french
+            d.language.should eql :german

            # Reset default
            Treat.core.language.detect = false