Merge pull request #70 from chris-at-thewebfellas/master
Possible fix for issues #68, #66, and #63
This commit is contained in:
commit
690157af8b
|
@ -210,8 +210,7 @@ module Treat::Entities::Entity::Buildable
|
|||
file.index('.xml')
|
||||
from_serialized_file(file)
|
||||
else
|
||||
fmt = Treat::Workers::Formatters::
|
||||
Readers::Autoselect.detect_format(file,def_fmt)
|
||||
fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
|
||||
from_raw_file(file, fmt)
|
||||
end
|
||||
|
||||
|
|
|
@ -58,16 +58,14 @@ module Treat::Entities::Entity::Stringable
|
|||
end
|
||||
|
||||
# Helper method to implode the string value of the subtree.
|
||||
def implode
|
||||
def implode(value = "")
|
||||
|
||||
return @value.dup if !has_children?
|
||||
|
||||
value = ''
|
||||
|
||||
each do |child|
|
||||
|
||||
if child.is_a?(Treat::Entities::Section)
|
||||
value += "\n\n"
|
||||
value << "\n\n"
|
||||
end
|
||||
|
||||
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
||||
|
@ -75,14 +73,14 @@ module Treat::Entities::Entity::Stringable
|
|||
child.is_a?(Treat::Entities::Enclitic)
|
||||
value.strip!
|
||||
end
|
||||
value += child.to_s + ' '
|
||||
value << child.to_s + ' '
|
||||
else
|
||||
value += child.implode
|
||||
child.implode(value)
|
||||
end
|
||||
|
||||
if child.is_a?(Treat::Entities::Title) ||
|
||||
child.is_a?(Treat::Entities::Paragraph)
|
||||
value += "\n\n"
|
||||
value << "\n\n"
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# Language detection using a probabilistic algorithm
|
||||
# that checks for the presence of words with Bloom
|
||||
# that checks for the presence of words with Bloom
|
||||
# filters built from dictionaries for each language.
|
||||
#
|
||||
# Original paper: Grothoff. 2007. A Quick Introduction to
|
||||
# Bloom Filters. Department of Computer Sciences, Purdue
|
||||
# Original paper: Grothoff. 2007. A Quick Introduction to
|
||||
# Bloom Filters. Department of Computer Sciences, Purdue
|
||||
# University.
|
||||
class Treat::Workers::Extractors::Language::WhatLanguage
|
||||
|
||||
|
@ -35,7 +35,7 @@ class Treat::Workers::Extractors::Language::WhatLanguage
|
|||
|
||||
options = DefaultOptions.merge(options)
|
||||
|
||||
@@detector ||= ::WhatLanguage.new(:possibilities)
|
||||
@@detector ||= ::WhatLanguage.new(:all)
|
||||
possibilities = @@detector.process_text(entity.to_s)
|
||||
lang = {}
|
||||
|
||||
|
|
|
@ -60,8 +60,7 @@ class Treat::Workers::Lexicalizers::Sensers::Wordnet
|
|||
|
||||
lemma.synsets.each do |synset|
|
||||
synsets <<
|
||||
Treat::Workers::Lexicalizers::
|
||||
Sensers::Wordnet::Synset.new(synset)
|
||||
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
||||
end
|
||||
|
||||
((synsets.collect do |ss|
|
||||
|
|
|
@ -5,12 +5,10 @@ class Treat::Workers::Processors::Chunkers::Autoselect
|
|||
entity.set :format, 'txt'
|
||||
end
|
||||
begin
|
||||
k = Treat::Workers::Processors::
|
||||
Chunkers.const_get(entity.format.cc)
|
||||
k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
|
||||
k.chunk(entity, options)
|
||||
rescue Treat::Exception
|
||||
Treat::Workers::Processors::
|
||||
Chunkers::TXT.chunk(entity, options)
|
||||
Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -12,16 +12,13 @@ class Treat::Workers::Processors::Chunkers::TXT
|
|||
zones.each do |zone|
|
||||
zone.strip!
|
||||
next if zone == ''
|
||||
c = Treat::Entities::
|
||||
Zone.from_string(zone)
|
||||
c = Treat::Entities::Zone.from_string(zone)
|
||||
if c.type == :title
|
||||
if current.type == :section
|
||||
current = current.parent
|
||||
current = entity << Treat::
|
||||
Entities::Section.new
|
||||
current = entity << Treat::Entities::Section.new
|
||||
else
|
||||
current = entity << Treat::
|
||||
Entities::Section.new
|
||||
current = entity << Treat::Entities::Section.new
|
||||
end
|
||||
end
|
||||
current << c
|
||||
|
|
|
@ -28,13 +28,10 @@ class Treat::Workers::Processors::Tokenizers::Punkt
|
|||
|
||||
s.scan(ReWordTokenizer).each do |token|
|
||||
if SentEndChars.include?(token[-1])
|
||||
entity << Treat::Entities::
|
||||
Token.from_string(token[0..-2])
|
||||
entity << Treat::Entities::
|
||||
Token.from_string(token[-1..-1])
|
||||
entity << Treat::Entities::Token.from_string(token[0..-2])
|
||||
entity << Treat::Entities::Token.from_string(token[-1..-1])
|
||||
else
|
||||
entity << Treat::Entities::
|
||||
Token.from_string(token)
|
||||
entity << Treat::Entities::Token.from_string(token)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
module Treat::Specs::Entities
|
||||
|
||||
|
||||
describe Treat::Entities::Collection do
|
||||
|
||||
before :all do
|
||||
|
@ -17,7 +17,7 @@ module Treat::Specs::Entities
|
|||
it "recursively searches the folder for " +
|
||||
"files and opens them into a collection of documents" do
|
||||
collection = Treat::Entities::Collection.build(@file)
|
||||
collection.size.should eql 5
|
||||
collection.children.size.should eql 5
|
||||
end
|
||||
|
||||
end
|
||||
|
@ -64,7 +64,7 @@ end
|
|||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
describe "#search" do
|
||||
|
||||
it "searches an indexed collection for a query " +
|
||||
|
@ -77,7 +77,7 @@ end
|
|||
|
||||
docs = collection.search :ferret, :q => 'Newton'
|
||||
docs.size.should eql 3
|
||||
|
||||
|
||||
docs.map { |d| d.chunk.title.to_s }.should
|
||||
eql [
|
||||
"Isaac (Sir) Newton (1642-1727)",
|
||||
|
@ -107,6 +107,6 @@ end
|
|||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
end
|
||||
=end
|
||||
|
|
|
@ -33,6 +33,24 @@ module Treat::Specs::Entities
|
|||
@adj_phrase << @adj
|
||||
@verb_phrase << [@aux, @verb]
|
||||
|
||||
@enc_phrase = Treat::Entities::Phrase.new
|
||||
@enc_noun_phrase = Treat::Entities::Phrase.new
|
||||
@enc_noun_phrase.set :tag, 'NP'
|
||||
@enc_verb_phrase = Treat::Entities::Phrase.new
|
||||
@enc_verb_phrase.set :tag, 'VP'
|
||||
@enc_pronoun = Treat::Entities::Word.new('It')
|
||||
@enc_pronoun.set :category, 'pronoun'
|
||||
@enc_pronoun.set :tag, 'PRP'
|
||||
@enc_enclitic = Treat::Entities::Enclitic.new('\'s')
|
||||
@enc_enclitic.set :category, 'verb'
|
||||
@enc_enclitic.set :tag, 'VBZ'
|
||||
@enc_adj = Treat::Entities::Word.new('hot')
|
||||
@enc_adj.set :category, 'adjectival'
|
||||
@enc_adj.set :tag, 'ADJP'
|
||||
|
||||
@enc_noun_phrase << @enc_pronoun
|
||||
@enc_verb_phrase << [ @enc_enclitic, @enc_adj ]
|
||||
@enc_phrase << [ @enc_noun_phrase, @enc_verb_phrase ]
|
||||
end
|
||||
|
||||
|
||||
|
@ -67,7 +85,7 @@ module Treat::Specs::Entities
|
|||
end
|
||||
|
||||
=begin
|
||||
|
||||
|
||||
describe "#frequency" do
|
||||
|
||||
it "returns the frequency of the entity's value in the root" do
|
||||
|
@ -82,9 +100,9 @@ module Treat::Specs::Entities
|
|||
it "returns the position of the entity's value "+
|
||||
"in the supplied parent type, or root if nil" do
|
||||
@noun_phrase.frequency_in(:sentence).should eql 1
|
||||
|
||||
|
||||
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
=end
|
||||
|
@ -100,8 +118,8 @@ module Treat::Specs::Entities
|
|||
|
||||
Treat::Entities::Entity.call_worker(
|
||||
'$'.to_entity, :tag, :lingua,
|
||||
Treat::Workers::Lexicalizers::Taggers, {}).should
|
||||
eql '$'.tag(:lingua)
|
||||
Treat::Workers::Lexicalizers::Taggers, {}).
|
||||
should eql '$'.tag(:lingua)
|
||||
|
||||
end
|
||||
|
||||
|
@ -284,25 +302,29 @@ module Treat::Specs::Entities
|
|||
describe "#to_s" do
|
||||
it "returns the string value of the " +
|
||||
"entity or its full subtree" do
|
||||
@paragraph.to_s.should
|
||||
eql 'The lazy fox is running.'
|
||||
@paragraph.to_s.
|
||||
should eql 'The lazy fox is running.'
|
||||
@noun.to_s.should eql 'fox'
|
||||
@enc_phrase.to_s.
|
||||
should eql 'It\'s hot'
|
||||
end
|
||||
end
|
||||
|
||||
describe "#inspect" do
|
||||
it "returns an informative string " +
|
||||
"concerning the entity" do
|
||||
@paragraph.inspect.should
|
||||
be_an_instance_of String
|
||||
@paragraph.inspect.
|
||||
should be_an_instance_of String
|
||||
end
|
||||
end
|
||||
|
||||
describe "#short_value" do
|
||||
it "returns a shortened version of the " +
|
||||
"entity's string value" do
|
||||
@paragraph.short_value.should
|
||||
eql 'The lazy fox is running.'
|
||||
@paragraph.short_value.
|
||||
should eql 'The lazy fox is running.'
|
||||
@enc_phrase.short_value.
|
||||
should eql 'It\'s hot'
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -406,14 +428,15 @@ module Treat::Specs::Entities
|
|||
it "guesses the language of the entity" do
|
||||
|
||||
Treat.core.language.detect = true
|
||||
a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
|
||||
b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
|
||||
c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
|
||||
d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
|
||||
a = 'I want to know God\'s thoughts; the rest are details.' # Albert Einstein
|
||||
b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran?' # Pablo Picasso
|
||||
c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France.' # Goethe
|
||||
d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen.' # Friedrich Nietzsche
|
||||
|
||||
a.language.should eql :english
|
||||
#b.language.should eql :spanish
|
||||
#c.language.should eql :french
|
||||
#d.language.should eql :german
|
||||
b.language.should eql :spanish
|
||||
c.language.should eql :french
|
||||
d.language.should eql :german
|
||||
|
||||
# Reset default
|
||||
Treat.core.language.detect = false
|
||||
|
|
Loading…
Reference in New Issue