Merge pull request #70 from chris-at-thewebfellas/master

Possible fix for issues #68, #66 and #63
This commit is contained in:
Louis Mullie 2014-02-18 19:26:51 -05:00
commit 690157af8b
9 changed files with 65 additions and 54 deletions

View File

@ -210,8 +210,7 @@ module Treat::Entities::Entity::Buildable
file.index('.xml')
from_serialized_file(file)
else
fmt = Treat::Workers::Formatters::
Readers::Autoselect.detect_format(file,def_fmt)
fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
from_raw_file(file, fmt)
end

View File

@ -58,16 +58,14 @@ module Treat::Entities::Entity::Stringable
end
# Helper method to implode the string value of the subtree.
def implode
def implode(value = "")
return @value.dup if !has_children?
value = ''
each do |child|
if child.is_a?(Treat::Entities::Section)
value += "\n\n"
value << "\n\n"
end
if child.is_a?(Treat::Entities::Token) || child.value != ''
@ -75,14 +73,14 @@ module Treat::Entities::Entity::Stringable
child.is_a?(Treat::Entities::Enclitic)
value.strip!
end
value += child.to_s + ' '
value << child.to_s + ' '
else
value += child.implode
child.implode(value)
end
if child.is_a?(Treat::Entities::Title) ||
child.is_a?(Treat::Entities::Paragraph)
value += "\n\n"
value << "\n\n"
end
end

View File

@ -1,9 +1,9 @@
# Language detection using a probabilistic algorithm
# that checks for the presence of words with Bloom
# that checks for the presence of words with Bloom
# filters built from dictionaries for each language.
#
# Original paper: Grothoff. 2007. A Quick Introduction to
# Bloom Filters. Department of Computer Sciences, Purdue
# Original paper: Grothoff. 2007. A Quick Introduction to
# Bloom Filters. Department of Computer Sciences, Purdue
# University.
class Treat::Workers::Extractors::Language::WhatLanguage
@ -35,7 +35,7 @@ class Treat::Workers::Extractors::Language::WhatLanguage
options = DefaultOptions.merge(options)
@@detector ||= ::WhatLanguage.new(:possibilities)
@@detector ||= ::WhatLanguage.new(:all)
possibilities = @@detector.process_text(entity.to_s)
lang = {}

View File

@ -60,8 +60,7 @@ class Treat::Workers::Lexicalizers::Sensers::Wordnet
lemma.synsets.each do |synset|
synsets <<
Treat::Workers::Lexicalizers::
Sensers::Wordnet::Synset.new(synset)
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
end
((synsets.collect do |ss|

View File

@ -5,12 +5,10 @@ class Treat::Workers::Processors::Chunkers::Autoselect
entity.set :format, 'txt'
end
begin
k = Treat::Workers::Processors::
Chunkers.const_get(entity.format.cc)
k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
k.chunk(entity, options)
rescue Treat::Exception
Treat::Workers::Processors::
Chunkers::TXT.chunk(entity, options)
Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
end
end

View File

@ -12,16 +12,13 @@ class Treat::Workers::Processors::Chunkers::TXT
zones.each do |zone|
zone.strip!
next if zone == ''
c = Treat::Entities::
Zone.from_string(zone)
c = Treat::Entities::Zone.from_string(zone)
if c.type == :title
if current.type == :section
current = current.parent
current = entity << Treat::
Entities::Section.new
current = entity << Treat::Entities::Section.new
else
current = entity << Treat::
Entities::Section.new
current = entity << Treat::Entities::Section.new
end
end
current << c

View File

@ -28,13 +28,10 @@ class Treat::Workers::Processors::Tokenizers::Punkt
s.scan(ReWordTokenizer).each do |token|
if SentEndChars.include?(token[-1])
entity << Treat::Entities::
Token.from_string(token[0..-2])
entity << Treat::Entities::
Token.from_string(token[-1..-1])
entity << Treat::Entities::Token.from_string(token[0..-2])
entity << Treat::Entities::Token.from_string(token[-1..-1])
else
entity << Treat::Entities::
Token.from_string(token)
entity << Treat::Entities::Token.from_string(token)
end
end

View File

@ -1,5 +1,5 @@
module Treat::Specs::Entities
describe Treat::Entities::Collection do
before :all do
@ -17,7 +17,7 @@ module Treat::Specs::Entities
it "recursively searches the folder for " +
"files and opens them into a collection of documents" do
collection = Treat::Entities::Collection.build(@file)
collection.size.should eql 5
collection.children.size.should eql 5
end
end
@ -64,7 +64,7 @@ end
end
end
describe "#search" do
it "searches an indexed collection for a query " +
@ -77,7 +77,7 @@ end
docs = collection.search :ferret, :q => 'Newton'
docs.size.should eql 3
docs.map { |d| d.chunk.title.to_s }.should
eql [
"Isaac (Sir) Newton (1642-1727)",
@ -107,6 +107,6 @@ end
end
end
end
=end

View File

@ -33,6 +33,24 @@ module Treat::Specs::Entities
@adj_phrase << @adj
@verb_phrase << [@aux, @verb]
@enc_phrase = Treat::Entities::Phrase.new
@enc_noun_phrase = Treat::Entities::Phrase.new
@enc_noun_phrase.set :tag, 'NP'
@enc_verb_phrase = Treat::Entities::Phrase.new
@enc_verb_phrase.set :tag, 'VP'
@enc_pronoun = Treat::Entities::Word.new('It')
@enc_pronoun.set :category, 'pronoun'
@enc_pronoun.set :tag, 'PRP'
@enc_enclitic = Treat::Entities::Enclitic.new('\'s')
@enc_enclitic.set :category, 'verb'
@enc_enclitic.set :tag, 'VBZ'
@enc_adj = Treat::Entities::Word.new('hot')
@enc_adj.set :category, 'adjectival'
@enc_adj.set :tag, 'ADJP'
@enc_noun_phrase << @enc_pronoun
@enc_verb_phrase << [ @enc_enclitic, @enc_adj ]
@enc_phrase << [ @enc_noun_phrase, @enc_verb_phrase ]
end
@ -67,7 +85,7 @@ module Treat::Specs::Entities
end
=begin
describe "#frequency" do
it "returns the frequency of the entity's value in the root" do
@ -82,9 +100,9 @@ module Treat::Specs::Entities
it "returns the position of the entity's value "+
"in the supplied parent type, or root if nil" do
@noun_phrase.frequency_in(:sentence).should eql 1
end
end
=end
@ -100,8 +118,8 @@ module Treat::Specs::Entities
Treat::Entities::Entity.call_worker(
'$'.to_entity, :tag, :lingua,
Treat::Workers::Lexicalizers::Taggers, {}).should
eql '$'.tag(:lingua)
Treat::Workers::Lexicalizers::Taggers, {}).
should eql '$'.tag(:lingua)
end
@ -284,25 +302,29 @@ module Treat::Specs::Entities
describe "#to_s" do
it "returns the string value of the " +
"entity or its full subtree" do
@paragraph.to_s.should
eql 'The lazy fox is running.'
@paragraph.to_s.
should eql 'The lazy fox is running.'
@noun.to_s.should eql 'fox'
@enc_phrase.to_s.
should eql 'It\'s hot'
end
end
describe "#inspect" do
it "returns an informative string " +
"concerning the entity" do
@paragraph.inspect.should
be_an_instance_of String
@paragraph.inspect.
should be_an_instance_of String
end
end
describe "#short_value" do
it "returns a shortened version of the " +
"entity's string value" do
@paragraph.short_value.should
eql 'The lazy fox is running.'
@paragraph.short_value.
should eql 'The lazy fox is running.'
@enc_phrase.short_value.
should eql 'It\'s hot'
end
end
@ -406,14 +428,15 @@ module Treat::Specs::Entities
it "guesses the language of the entity" do
Treat.core.language.detect = true
a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
a = 'I want to know God\'s thoughts; the rest are details.' # Albert Einstein
b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran?' # Pablo Picasso
c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France.' # Goethe
d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen.' # Friedrich Nietzsche
a.language.should eql :english
#b.language.should eql :spanish
#c.language.should eql :french
#d.language.should eql :german
b.language.should eql :spanish
c.language.should eql :french
d.language.should eql :german
# Reset default
Treat.core.language.detect = false