Compare commits
254 Commits
config-ref
...
master
Author | SHA1 | Date |
---|---|---|
![]() |
8c6450a0d7 | |
![]() |
d6a163f8f7 | |
![]() |
58911ce969 | |
![]() |
7b6e8fe6a8 | |
![]() |
f1f8c010a6 | |
![]() |
a86c3036cc | |
![]() |
e6c1f7c3a4 | |
![]() |
4dfaf83a14 | |
![]() |
eed75d3ba7 | |
![]() |
3dc184277d | |
![]() |
e218791c88 | |
![]() |
2d6891ab9c | |
![]() |
f61adc8187 | |
![]() |
cb7ae61cc5 | |
![]() |
394edb8fc4 | |
![]() |
721215b64a | |
![]() |
90fdcd8c9e | |
![]() |
3992f2c177 | |
![]() |
159f3b266e | |
![]() |
b7c6ae452b | |
![]() |
e753389911 | |
![]() |
1b783faf77 | |
![]() |
febc47a905 | |
![]() |
3969478f7f | |
![]() |
fc77587390 | |
![]() |
eb0ec83053 | |
![]() |
f8e5f8392a | |
![]() |
c813e72c2c | |
![]() |
316a6cec04 | |
![]() |
d45d82b3c4 | |
![]() |
78c13c280f | |
![]() |
690157af8b | |
![]() |
a8ce6b2f18 | |
![]() |
f7bd0c35d7 | |
![]() |
65235fb935 | |
![]() |
0ed8a37761 | |
![]() |
8f82086bb1 | |
![]() |
d9b912f24d | |
![]() |
10e6612a06 | |
![]() |
5fb11131c7 | |
![]() |
281934d572 | |
![]() |
c9bf91562b | |
![]() |
643937b231 | |
![]() |
fff57a4526 | |
![]() |
49e99ccd0d | |
![]() |
136a7cdbb3 | |
![]() |
a8448f8103 | |
![]() |
91e3571699 | |
![]() |
6464bb64a0 | |
![]() |
5c984a1dbb | |
![]() |
20e3aa1bb7 | |
![]() |
e483b764e4 | |
![]() |
3a367f7c56 | |
![]() |
c99dfac530 | |
![]() |
ee98b19960 | |
![]() |
bf6d951cf7 | |
![]() |
1b28d3640d | |
![]() |
727a307af0 | |
![]() |
cf8acd2b64 | |
![]() |
1aa4df9566 | |
![]() |
89cf494cba | |
![]() |
e2ef8ea748 | |
![]() |
038d62b12b | |
![]() |
767e1f7f45 | |
![]() |
4cbcf03d43 | |
![]() |
5e4189218c | |
![]() |
482ed30d76 | |
![]() |
b9ee19e276 | |
![]() |
fe7f97afd7 | |
![]() |
e135ffa466 | |
![]() |
a755272934 | |
![]() |
8b8a769d74 | |
![]() |
e2b813ca21 | |
![]() |
f72c8aec54 | |
![]() |
6a025ca4d6 | |
![]() |
1f69c3f089 | |
![]() |
4b74913bcd | |
![]() |
a34714cb99 | |
![]() |
b37ecdb695 | |
![]() |
01c3e50ef0 | |
![]() |
ba5a2d78ad | |
![]() |
c6ceac22d6 | |
![]() |
c18d0073f7 | |
![]() |
056ef129d6 | |
![]() |
6e644db50a | |
![]() |
d351108ad0 | |
![]() |
cbca3d8d37 | |
![]() |
ebf8a16c2a | |
![]() |
683861dfd1 | |
![]() |
2cded62abf | |
![]() |
e32f44f0a0 | |
![]() |
e87574c03d | |
![]() |
6cf37a2926 | |
![]() |
ff80d8ab2c | |
![]() |
fdb5abb748 | |
![]() |
931a66ec40 | |
![]() |
644248504d | |
![]() |
0731d33024 | |
![]() |
e8b12c5d03 | |
![]() |
cc2b8d6679 | |
![]() |
e284aee7eb | |
![]() |
8209447625 | |
![]() |
201ac69775 | |
![]() |
c42ee58bcd | |
![]() |
970a6248d5 | |
![]() |
7fb10de5cc | |
![]() |
577a063165 | |
![]() |
9fa4ee9c1e | |
![]() |
fd0733113a | |
![]() |
50ac0f67e0 | |
![]() |
c2532d6cd2 | |
![]() |
0abeb6ba14 | |
![]() |
d38387265b | |
![]() |
abf16b2b3e | |
![]() |
20b49cc471 | |
![]() |
3295529e97 | |
![]() |
5087b53623 | |
![]() |
e149c46bde | |
![]() |
9836dabe87 | |
![]() |
02ccf9942f | |
![]() |
c2a7a33fae | |
![]() |
e9dd1a9298 | |
![]() |
3484baa904 | |
![]() |
2b8e269380 | |
![]() |
28669ec6c6 | |
![]() |
5daa79c2f0 | |
![]() |
72eb214650 | |
![]() |
9f89a006a4 | |
![]() |
ec3465d24d | |
![]() |
b76a5c2c15 | |
![]() |
3e2a898155 | |
![]() |
7ef79deaa5 | |
![]() |
e13ff4dcab | |
![]() |
796ae59d77 | |
![]() |
9f54b6daba | |
![]() |
66f0cadc57 | |
![]() |
a7b4a7eea8 | |
![]() |
2d9ef825c2 | |
![]() |
0e06750ede | |
![]() |
2c6c4ec964 | |
![]() |
4d60ceb6ea | |
![]() |
b57d047ea8 | |
![]() |
10abcb7e20 | |
![]() |
1ab18f42c3 | |
![]() |
b6be6e7e1d | |
![]() |
4577430e84 | |
![]() |
6c46ebf707 | |
![]() |
01125b8a38 | |
![]() |
a7991bd009 | |
![]() |
6cc94ca6c5 | |
![]() |
6f443b3087 | |
![]() |
5b22b0d3b8 | |
![]() |
7c1b597619 | |
![]() |
2394de746d | |
![]() |
7f83e78a04 | |
![]() |
1c69d6558e | |
![]() |
da096bd15e | |
![]() |
1e47f41510 | |
![]() |
d3e52fb36a | |
![]() |
b5528a34f1 | |
![]() |
e4664586fe | |
![]() |
a49d1d97a0 | |
![]() |
fbd5e48fae | |
![]() |
ee3ed723bc | |
![]() |
476962e126 | |
![]() |
57e3c88d9e | |
![]() |
bb4910c43d | |
![]() |
a1d09cbb99 | |
![]() |
c8020c90e6 | |
![]() |
5ac537db5a | |
![]() |
2162339419 | |
![]() |
be96c54064 | |
![]() |
a116cc8a5b | |
![]() |
12cac5c769 | |
![]() |
1eb9341f77 | |
![]() |
22b2afdddf | |
![]() |
01d8f656b2 | |
![]() |
9af52ec798 | |
![]() |
30196bb699 | |
![]() |
f2edcf8118 | |
![]() |
b20836bc04 | |
![]() |
3fa0e8593b | |
![]() |
cd044f1c59 | |
![]() |
5144f43c63 | |
![]() |
f0d63fea20 | |
![]() |
7f93decac7 | |
![]() |
f6862357e6 | |
![]() |
19813f6a14 | |
![]() |
295e22417d | |
![]() |
9fb2c9a127 | |
![]() |
e5f8da4201 | |
![]() |
386d56de70 | |
![]() |
24e02802b9 | |
![]() |
18fce305ed | |
![]() |
49cfd6a6f7 | |
![]() |
5c1ca9d9f1 | |
![]() |
f7f53b0f7e | |
![]() |
a6b42ca20e | |
![]() |
d3f6df6266 | |
![]() |
b20a5dd412 | |
![]() |
d09dd391b7 | |
![]() |
a8170397ff | |
![]() |
c182ca8a42 | |
![]() |
8b8a1c93f3 | |
![]() |
ba7d0995d6 | |
![]() |
c7aa48e130 | |
![]() |
22f17a5b4c | |
![]() |
61d683603c | |
![]() |
3c925e579f | |
![]() |
f01f050f9d | |
![]() |
afefa10e87 | |
![]() |
d2a41c0dea | |
![]() |
651c8490f2 | |
![]() |
1355122e2f | |
![]() |
65311ceda9 | |
![]() |
0240847ce1 | |
![]() |
3ae63233c2 | |
![]() |
86db70f1e8 | |
![]() |
2c7aff0323 | |
![]() |
98b39ef541 | |
![]() |
f4975270b2 | |
![]() |
e73463c0d0 | |
![]() |
e995da0e71 | |
![]() |
aa99e71058 | |
![]() |
8baa98930a | |
![]() |
7997a5d6b7 | |
![]() |
c0e98efb76 | |
![]() |
be72e2499a | |
![]() |
2495d9cf05 | |
![]() |
e588051558 | |
![]() |
25689e5ef1 | |
![]() |
decb377e55 | |
![]() |
5535c03141 | |
![]() |
7a902900b2 | |
![]() |
69d7c9ff8a | |
![]() |
56902944c1 | |
![]() |
a713441546 | |
![]() |
d43e6be5c8 | |
![]() |
e8baae7513 | |
![]() |
f2c22b42f4 | |
![]() |
5a750834ae | |
![]() |
363eaca919 | |
![]() |
06913afa8a | |
![]() |
9e755535c6 | |
![]() |
4dd56750fb | |
![]() |
544a2a115e | |
![]() |
19a59eaae8 | |
![]() |
233894a5d8 | |
![]() |
a34fca4e88 | |
![]() |
a36bd07777 | |
![]() |
13175e8f25 | |
![]() |
7b30c26686 | |
![]() |
502b21797f | |
![]() |
e79c2dbdb8 |
|
@ -13,4 +13,5 @@
|
|||
*.yaml
|
||||
spec/sandbox.rb
|
||||
coverage/*
|
||||
benchmark/*
|
||||
TODO
|
11
.travis.yml
11
.travis.yml
|
@ -1,11 +1,18 @@
|
|||
language: ruby
|
||||
|
||||
rvm:
|
||||
- 1.9.2
|
||||
- 1.9.3
|
||||
- 2.0
|
||||
- 2.1
|
||||
- 2.2
|
||||
|
||||
before_install:
|
||||
- export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk-i386/"
|
||||
before_script:
|
||||
|
||||
before_script:
|
||||
- sudo apt-get install antiword
|
||||
- sudo apt-get install poppler-utils
|
||||
- rake treat:install[travis] --trace
|
||||
script: rake treat:spec --trace
|
||||
|
||||
script: rake treat:spec --trace
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
# A boolean value indicating whether to silence
|
||||
# the output of external libraries (e.g. Stanford
|
||||
# tools, Enju, LDA, Ruby-FANN, Schiphol).
|
||||
Treat.core.verbosity.silence = false
|
||||
|
||||
# A boolean value indicating whether to explain
|
||||
# the steps that Treat is performing.
|
||||
Treat.core.verbosity.debug = true
|
||||
|
||||
# A boolean value indicating whether Treat should
|
||||
# try to detect the language of newly input text.
|
||||
Treat.core.language.detect = false
|
||||
|
||||
# A string representing the language to default
|
||||
# to when detection is off.
|
||||
Treat.core.language.default = 'english'
|
||||
|
||||
# A symbol representing the finest level at which
|
||||
# language detection should be performed if language
|
||||
# detection is turned on.
|
||||
Treat.core.language.detect_at = :document
|
||||
|
||||
# The directory containing executables and JAR files.
|
||||
Treat.paths.bin = '##_INSTALLER_BIN_PATH_##'
|
||||
|
||||
# The directory containing trained models
|
||||
Treat.paths.models = '##_INSTALLER_MODELS_PATH_##'
|
||||
|
||||
# Mongo database configuration.
|
||||
Treat.databases.mongo.db = 'your_database'
|
||||
Treat.databases.mongo.host = 'localhost'
|
||||
Treat.databases.mongo.port = '27017'
|
||||
|
||||
# Include the DSL by default.
|
||||
include Treat::Core::DSL
|
57
Gemfile
57
Gemfile
|
@ -1,48 +1,45 @@
|
|||
source :rubygems
|
||||
source 'https://rubygems.org'
|
||||
|
||||
gemspec
|
||||
|
||||
gem 'birch'
|
||||
gem 'schiphol'
|
||||
gem 'sourcify'
|
||||
gem 'yomu'
|
||||
gem 'ruby-readability'
|
||||
gem 'nokogiri'
|
||||
|
||||
group :test do
|
||||
gem 'rspec', '2.9.0'
|
||||
gem 'rspec'
|
||||
gem 'rake'
|
||||
gem 'terminal-table'
|
||||
gem 'simplecov'
|
||||
end
|
||||
|
||||
=begin
|
||||
gem 'nokogiri'
|
||||
gem 'psych'
|
||||
gem 'mongoid'
|
||||
gem 'mongo'
|
||||
gem 'bson_ext'
|
||||
|
||||
gem 'zip'
|
||||
gem 'ferret'
|
||||
gem 'lda-ruby'
|
||||
gem 'stanford-core-nlp'
|
||||
gem 'linguistics'
|
||||
gem 'ruby-readability'
|
||||
gem 'whatlanguage'
|
||||
gem 'chronic'
|
||||
gem 'nickel'
|
||||
gem 'engtagger'
|
||||
gem 'open-nlp'
|
||||
gem 'stanford-core-nlp'
|
||||
gem 'rwordnet'
|
||||
gem 'scalpel'
|
||||
gem 'fastimage'
|
||||
gem 'decisiontree'
|
||||
gem 'rb-libsvm'
|
||||
gem 'ai4r'
|
||||
gem 'whatlanguage'
|
||||
gem 'zip'
|
||||
gem 'nickel'
|
||||
gem 'tactful_tokenizer'
|
||||
gem 'srx-english'
|
||||
gem 'punkt-segmenter'
|
||||
gem 'chronic'
|
||||
gem 'uea-stemmer'
|
||||
gem 'rbtagger'
|
||||
gem 'ruby-stemmer'
|
||||
gem 'punkt-segmenter'
|
||||
gem 'tactful_tokenizer'
|
||||
gem 'nickel'
|
||||
gem 'rwordnet'
|
||||
gem 'uea-stemmer'
|
||||
gem 'engtagger'
|
||||
gem 'activesupport'
|
||||
gem 'srx-english'
|
||||
gem 'scalpel'
|
||||
=end
|
||||
|
||||
# english?
|
||||
gem 'rb-libsvm'
|
||||
gem 'tomz-liblinear-ruby-swig'
|
||||
gem 'ruby-fann'
|
||||
gem 'fuzzy-string-match'
|
||||
gem 'levenshtein-ffi'
|
||||
gem 'tf-idf-similarity'
|
||||
gem 'kronic'
|
||||
=end
|
4
LICENSE
4
LICENSE
|
@ -1,4 +1,4 @@
|
|||
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.2
|
||||
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 2.0.0
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
|
@ -15,7 +15,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
|
||||
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
|
||||
|
||||
Non-trivial amount of code has been incorporated and modified from other libraries:
|
||||
A non-trivial amount of code has been incorporated and modified from other libraries:
|
||||
|
||||
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
||||
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
||||
|
|
44
README.md
44
README.md
|
@ -1,35 +1,43 @@
|
|||
[](http://travis-ci.org/#!/louismullie/treat)
|
||||
[](https://gemnasium.com/louismullie/treat)
|
||||
[](https://codeclimate.com/github/louismullie/treat)
|
||||
[](https://codeclimate.com/github/louismullie/treat)
|
||||
|
||||
Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm- agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition.
|
||||

|
||||
|
||||
**Current features**
|
||||
**New in v2.0.5: [OpenNLP integration](https://github.com/louismullie/treat/commit/727a307af0c64747619531c3aa355535edbf4632) and [Yomu support](https://github.com/louismullie/treat/commit/e483b764e4847e48b39e91a77af8a8baa1a1d056)**
|
||||
|
||||
Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm- agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition. Learn more by taking a [quick tour](https://github.com/louismullie/treat/wiki/Quick-Tour) or by reading the [manual](https://github.com/louismullie/treat/wiki/Manual).
|
||||
|
||||
**Features**
|
||||
|
||||
* Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
|
||||
* Text retrieval with indexation and full-text search (Ferret).
|
||||
* Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
|
||||
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
||||
* Lexical resources (WordNet interface, several POS taggers for English).
|
||||
* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
|
||||
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
||||
* Serialization of annotated entities to YAML, XML or to MongoDB.
|
||||
* Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
|
||||
* Linguistic resources, including language detection and tag alignments for several treebanks.
|
||||
* Machine learning (decision tree, multilayer perceptron, linear, support vector machines).
|
||||
* Machine learning (decision tree, multilayer perceptron, LIBLINEAR, LIBSVM).
|
||||
* Text retrieval with indexation and full-text search (Ferret).
|
||||
|
||||
<br>
|
||||
**Contributing**
|
||||
|
||||
**Resources**
|
||||
I am actively seeking developers that can help maintain and expand this project. You can find a list of ideas for contributing to the project [here](https://github.com/louismullie/treat/wiki/Contributing).
|
||||
|
||||
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
||||
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
|
||||
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
|
||||
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
|
||||
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
||||
* Open an [issue](https://github.com/louismullie/treat/issues).
|
||||
|
||||
<br>
|
||||
**Authors**
|
||||
|
||||
Lead developer: @louismullie [[Twitter](https://twitter.com/LouisMullie)]
|
||||
|
||||
Contributors:
|
||||
|
||||
- @bdigital
|
||||
- @automatedtendencies
|
||||
- @LeFnord
|
||||
- @darkphantum
|
||||
- @whistlerbrk
|
||||
- @smileart
|
||||
- @erol
|
||||
|
||||
**License**
|
||||
|
||||
This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
|
||||
This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
|
||||
|
|
7
RELEASE
7
RELEASE
|
@ -48,4 +48,9 @@ Treat - Text Retrieval, Extraction and Annotation Toolkit
|
|||
* Added LIBSVM and LIBLINEAR classifier support.
|
||||
* Added support for serialization of documents and data sets to MongoDB.
|
||||
* Added specs for most of the core classes.
|
||||
* Several bug fixes.
|
||||
* Several bug fixes.
|
||||
|
||||
2.0.0rc1
|
||||
|
||||
* MAJOR CHANGE: the old DSL is no longer supported. A new DSL style using
|
||||
lowercase keywords is now used and must be required explicitly.
|
16
Rakefile
16
Rakefile
|
@ -40,20 +40,8 @@ namespace :treat do
|
|||
task :spec, [:language] do |t, args|
|
||||
require_relative 'spec/helper'
|
||||
Treat::Specs::Helper.start_coverage
|
||||
Treat::Specs::Helper.run_core_specs
|
||||
Treat::Specs::Helper.run_examples_as(
|
||||
'spec', args.language)
|
||||
end
|
||||
|
||||
# Runs worker benchmarks for all languages (by
|
||||
# default), or for a specific language (if supplied).
|
||||
# Also outputs an HTML table
|
||||
# Syntax: rake treat:benchmark (all languages)
|
||||
# - OR - rake treat:benchmark[language]
|
||||
task :benchmark, [:language] do |t, args|
|
||||
require_relative 'spec/helper'
|
||||
Treat::Specs::Helper.run_examples_as(
|
||||
'benchmark', args.language)
|
||||
Treat::Specs::Helper.run_library_specs
|
||||
Treat::Specs::Helper.run_language_specs(args.language)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -18,5 +18,6 @@ module Treat
|
|||
require_relative 'treat/exception'
|
||||
require_relative 'treat/autoload'
|
||||
require_relative 'treat/modules'
|
||||
require_relative 'treat/builder'
|
||||
|
||||
end
|
|
@ -14,21 +14,31 @@ module Treat::Autoload
|
|||
# Loads all the files for the base
|
||||
# module in the appropriate order.
|
||||
def self.included(base)
|
||||
# Get the parts of module name.
|
||||
bits = base.to_s.split('::')
|
||||
# Singularize the module name.
|
||||
w = bits[-1].downcase
|
||||
n = (w[-3..-1] == 'ies' ?
|
||||
(w[0..-4] + 'y') : (w[-1] ==
|
||||
's' ? w[0...-1] : w)) + '.rb'
|
||||
# Get the module's directory.
|
||||
d = File.dirname(File.
|
||||
expand_path(__FILE__))[0..-6] +
|
||||
bits.join('/').downcase + '/'
|
||||
# Require base class if exists.
|
||||
require d + n if File.readable?(d + n)
|
||||
# Require all other files in dir.
|
||||
Dir.glob("#{d}*.rb").each { |f| require f }
|
||||
m = self.get_module_name(base)
|
||||
d = self.get_module_path(m)
|
||||
n = self.singularize(m) + '.rb'
|
||||
f, p = File.join(d, n), "#{d}/*.rb"
|
||||
require f if File.readable?(f)
|
||||
Dir.glob(p).each { |f| require f }
|
||||
end
|
||||
|
||||
# Returns the path to a module's dir.
|
||||
def self.get_module_path(name)
|
||||
file = File.expand_path(__FILE__)
|
||||
dirs = File.dirname(file).split('/')
|
||||
File.join(*dirs[0..-1], name)
|
||||
end
|
||||
|
||||
# Return the downcased form of the
|
||||
# module's last name (e.g. "entities").
|
||||
def self.get_module_name(mod)
|
||||
mod.to_s.split('::')[-1].downcase
|
||||
end
|
||||
|
||||
# Helper method to singularize words.
|
||||
def self.singularize(w)
|
||||
if w[-3..-1] == 'ies'; w[0..-4] + 'y'
|
||||
else; (w[-1] == 's' ? w[0..-2] : w); end
|
||||
end
|
||||
|
||||
end
|
|
@ -0,0 +1,6 @@
|
|||
class Treat::Builder
|
||||
include Treat::Core::DSL
|
||||
def initialize(&block)
|
||||
instance_exec(&block)
|
||||
end
|
||||
end
|
|
@ -3,71 +3,36 @@
|
|||
# the /config folder.
|
||||
module Treat::Config
|
||||
|
||||
# Require configurable mix in.
|
||||
require_relative 'importable'
|
||||
|
||||
# Make all configuration importable.
|
||||
extend Treat::Config::Importable
|
||||
|
||||
# Core configuration options for entities.
|
||||
class Treat::Config::Entities; end
|
||||
|
||||
# Configuration for paths to models, binaries,
|
||||
# temporary storage and file downloads.
|
||||
class Treat::Config::Paths; end
|
||||
|
||||
# Configuration for all Treat workers.
|
||||
class Treat::Config::Workers; end
|
||||
|
||||
# Helpful linguistic options.
|
||||
class Treat::Config::Linguistics; end
|
||||
|
||||
# Supported workers for each language.
|
||||
class Treat::Config::Languages; end
|
||||
|
||||
# Configuration options for external libraries.
|
||||
class Treat::Config::Libraries; end
|
||||
|
||||
class Treat::Config::Workers; end
|
||||
|
||||
# Configuration options for database
|
||||
# connectivity (host, port, etc.)
|
||||
class Treat::Config::Databases; end
|
||||
|
||||
# Configuration options for Treat core.
|
||||
class Treat::Config::Core; end
|
||||
|
||||
# Require autoloadable mix in.
|
||||
require_relative 'configurable'
|
||||
|
||||
# Store all the configuration in self.config
|
||||
class << self; attr_accessor :config; end
|
||||
|
||||
# Setup a proxy on the main Treat module to
|
||||
# make configuration options directly accessible,
|
||||
# using e.g. Treat.paths.tmp = '...'
|
||||
Treat.module_eval do
|
||||
# Handle all missing methods as conf options.
|
||||
# Instead, should dynamically define them. FIXME.
|
||||
def self.method_missing(sym, *args, &block)
|
||||
super(sym, *args, &block) if sym == :to_ary
|
||||
Treat::Config.config[sym]
|
||||
end
|
||||
end
|
||||
|
||||
# Main function; loads all configuration options.
|
||||
def self.configure!
|
||||
config = {}
|
||||
Treat::Config.constants.each do |const|
|
||||
unless const == :Configurable
|
||||
klass = Treat::Config.const_get(const)
|
||||
klass.class_eval do
|
||||
extend Treat::Config::Configurable
|
||||
end
|
||||
k = const.to_s.downcase.intern
|
||||
klass.configure!
|
||||
config[k] = klass.config
|
||||
end
|
||||
end
|
||||
self.config = self.hash_to_struct(config)
|
||||
end
|
||||
|
||||
# * Helper methods * #
|
||||
|
||||
# Convert a hash to nested structs.
|
||||
def self.hash_to_struct(hash)
|
||||
return hash if hash.keys.
|
||||
select { |k| !k.is_a?(Symbol) }.size > 0
|
||||
struct = Struct.new(*hash.keys).new(*hash.values)
|
||||
hash.each do |key, value|
|
||||
if value.is_a?(Hash)
|
||||
struct[key] = self.hash_to_struct(value)
|
||||
end
|
||||
end; return struct
|
||||
end
|
||||
|
||||
end
|
|
@ -1,10 +1,29 @@
|
|||
# Provide default functionality to load configuration
|
||||
# options from flat files into their respective modules.
|
||||
module Treat::Config::Configurable
|
||||
|
||||
|
||||
# When extended, add the .config property to
|
||||
# the class that is being operated on.
|
||||
def self.extended(base)
|
||||
class << base; attr_accessor :config; end
|
||||
base.class_eval { self.config = {} }
|
||||
end
|
||||
|
||||
# Provide base functionality to configure
|
||||
# all modules. The behaviour is as follows:
|
||||
#
|
||||
# 1 - Check if a file named data/$CLASS$.rb
|
||||
# exists; if so, load that file as the base
|
||||
# configuration, i.e. "Treat.$CLASS$"; e.g.
|
||||
# "Treat.core"
|
||||
#
|
||||
# 2 - Check if a folder named data/$CLASS$
|
||||
# exists; if so, load each file in that folder
|
||||
# as a suboption of the main configuration,
|
||||
# i.e. "Treat.$CLASS$.$FILE$"; e.g. "Treat.workers"
|
||||
#
|
||||
# (where $CLASS$ is the lowercase name of
|
||||
# the concrete class being extended by this.)
|
||||
def configure!
|
||||
path = File.dirname(File.expand_path( # FIXME
|
||||
__FILE__)).split('/')[0..-4].join('/') + '/'
|
||||
|
@ -14,15 +33,19 @@ module Treat::Config::Configurable
|
|||
base_file = main_dir + mod_name + '.rb'
|
||||
if File.readable?(base_file)
|
||||
self.config = eval(File.read(base_file))
|
||||
end
|
||||
if FileTest.directory?(conf_dir)
|
||||
config = {}
|
||||
Dir[conf_dir + '/*'].each do |path|
|
||||
name = File.basename(path, '.*').intern
|
||||
config[name] = eval(File.read(path))
|
||||
end
|
||||
self.config = config
|
||||
elsif FileTest.directory?(conf_dir)
|
||||
self.config = self.from_dir(conf_dir)
|
||||
else; raise Treat::Exception,
|
||||
"No config file found for #{mod_name}."
|
||||
end
|
||||
end
|
||||
|
||||
# * Helper methods for configuration * #
|
||||
def from_dir(conf_dir)
|
||||
Hash[Dir[conf_dir + '/*'].map do |path|
|
||||
name = File.basename(path, '.*').intern
|
||||
[name, eval(File.read(path))]
|
||||
end]
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
{acronyms:
|
||||
['xml', 'html', 'txt', 'odt',
|
||||
'abw', 'doc', 'yaml', 'uea',
|
||||
'lda', 'pdf', 'ptb', 'dot',
|
||||
'ai', 'id3', 'svo', 'mlp',
|
||||
'svm', 'srx'],
|
||||
|
||||
encodings:
|
||||
{language_to_code: {
|
||||
arabic: 'UTF-8',
|
||||
chinese: 'GB18030',
|
||||
english: 'UTF-8',
|
||||
french: 'ISO_8859-1',
|
||||
ferman: 'ISO_8859-1',
|
||||
hebrew: 'UTF-8'
|
||||
}},
|
||||
|
||||
entities:
|
||||
{list:
|
||||
[:entity, :unknown, :email,
|
||||
:url, :symbol, :sentence,
|
||||
:punctuation, :number,
|
||||
:enclitic, :word, :token,
|
||||
:fragment, :phrase, :paragraph,
|
||||
:title, :zone, :list, :block,
|
||||
:page, :section, :collection,
|
||||
:document],
|
||||
order:
|
||||
[:token, :fragment, :phrase,
|
||||
:sentence, :zone, :section,
|
||||
:document, :collection]},
|
||||
language: {
|
||||
default: :english,
|
||||
detect: false,
|
||||
detect_at: :document
|
||||
},
|
||||
paths: {
|
||||
description: {
|
||||
tmp: 'temporary files',
|
||||
lib: 'class and module definitions',
|
||||
bin: 'binary files',
|
||||
files: 'user-saved files',
|
||||
models: 'model files',
|
||||
spec: 'spec test files'
|
||||
}
|
||||
},
|
||||
|
||||
syntax: { sweetened: false },
|
||||
|
||||
verbosity: { debug: false, silence: true}}
|
|
@ -4,7 +4,7 @@
|
|||
'abw', 'doc', 'yaml', 'uea',
|
||||
'lda', 'pdf', 'ptb', 'dot',
|
||||
'ai', 'id3', 'svo', 'mlp',
|
||||
'svm', 'srx'],
|
||||
'svm', 'srx', 'nlp'],
|
||||
|
||||
encodings:
|
||||
{language_to_code: {
|
||||
|
@ -21,13 +21,13 @@
|
|||
[:entity, :unknown, :email,
|
||||
:url, :symbol, :sentence,
|
||||
:punctuation, :number,
|
||||
:enclitic, :word, :token,
|
||||
:enclitic, :word, :token, :group,
|
||||
:fragment, :phrase, :paragraph,
|
||||
:title, :zone, :list, :block,
|
||||
:page, :section, :collection,
|
||||
:document],
|
||||
order:
|
||||
[:token, :fragment, :phrase,
|
||||
[:token, :fragment, :group,
|
||||
:sentence, :zone, :section,
|
||||
:document, :collection]},
|
||||
language: {
|
||||
|
@ -45,7 +45,9 @@
|
|||
spec: 'spec test files'
|
||||
}
|
||||
},
|
||||
|
||||
learning: {
|
||||
list: [:data_set, :export, :feature, :tag, :problem, :question]
|
||||
},
|
||||
syntax: { sweetened: false },
|
||||
|
||||
verbosity: { debug: false, silence: true}
|
||||
|
|
|
@ -1,21 +1,12 @@
|
|||
{
|
||||
dependencies: [
|
||||
'psych',
|
||||
'nokogiri',
|
||||
'ferret',
|
||||
'bson_ext',
|
||||
'mongo',
|
||||
'lda-ruby',
|
||||
'stanford-core-nlp',
|
||||
'linguistics',
|
||||
'ruby-readability',
|
||||
'whatlanguage',
|
||||
'chronic',
|
||||
'nickel',
|
||||
'decisiontree',
|
||||
'rb-libsvm',
|
||||
'ai4r',
|
||||
'zip'
|
||||
'ferret', 'bson_ext', 'mongo', 'lda-ruby',
|
||||
'stanford-core-nlp', 'linguistics',
|
||||
'ruby-readability', 'whatlanguage',
|
||||
'chronic', 'kronic', 'nickel', 'decisiontree',
|
||||
'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
|
||||
'tf-idf-similarity', 'narray', 'fastimage',
|
||||
'fuzzy-string-match', 'levenshtein-ffi'
|
||||
],
|
||||
workers: {
|
||||
learners: {
|
||||
|
@ -25,7 +16,9 @@
|
|||
keywords: [:tf_idf],
|
||||
language: [:what_language],
|
||||
topic_words: [:lda],
|
||||
tf_idf: [:native]
|
||||
tf_idf: [:native],
|
||||
distance: [:levenshtein],
|
||||
similarity: [:jaro_winkler, :tf_idf]
|
||||
},
|
||||
formatters: {
|
||||
serializers: [:xml, :yaml, :mongo],
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
],
|
||||
workers: {
|
||||
extractors: {
|
||||
time: [:chronic, :ruby, :nickel],
|
||||
time: [:chronic, :kronic, :ruby, :nickel],
|
||||
topics: [:reuters],
|
||||
name_tag: [:stanford]
|
||||
},
|
||||
|
@ -32,28 +32,64 @@
|
|||
},
|
||||
processors: {
|
||||
parsers: [:stanford],
|
||||
segmenters: [:srx, :tactful, :punkt, :stanford, :scalpel],
|
||||
tokenizers: [:ptb, :stanford, :punkt]
|
||||
segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
|
||||
tokenizers: [:ptb, :stanford, :punkt, :open_nlp]
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
['the', 'of', 'and', 'a', 'to', 'in', 'is',
|
||||
'you', 'that', 'it', 'he', 'was', 'for', 'on',
|
||||
'are', 'as', 'with', 'his', 'they', 'I', 'at',
|
||||
'be', 'this', 'have', 'from', 'or', 'one', 'had',
|
||||
'by', 'word', 'but', 'not', 'what', 'all', 'were',
|
||||
'we', 'when', 'your', 'can', 'said', 'there', 'use',
|
||||
'an', 'each', 'which', 'she', 'do', 'how', 'their',
|
||||
'if', 'will', 'up', 'other', 'about', 'out', 'many',
|
||||
'then', 'them', 'these', 'so', 'some', 'her', 'would',
|
||||
'make', 'like', 'him', 'into', 'time', 'has', 'look',
|
||||
'two', 'more', 'write', 'go', 'see', 'number', 'no',
|
||||
'way', 'could', 'people', 'my', 'than', 'first', 'been',
|
||||
'call', 'who', 'its', 'now', 'find', 'long', 'down',
|
||||
'day', 'did', 'get', 'come', 'made', 'may', 'part',
|
||||
'say', 'also', 'new', 'much', 'should', 'still',
|
||||
'such', 'before', 'after', 'other', 'then', 'over',
|
||||
'under', 'therefore', 'nonetheless', 'thereafter',
|
||||
'afterwards', 'here', 'huh', 'hah', "n't", "'t", 'here',
|
||||
'neither', 'towards']
|
||||
[
|
||||
"about",
|
||||
"also",
|
||||
"are",
|
||||
"away",
|
||||
"because",
|
||||
"been",
|
||||
"beside",
|
||||
"besides",
|
||||
"between",
|
||||
"but",
|
||||
"cannot",
|
||||
"could",
|
||||
"did",
|
||||
"etc",
|
||||
"even",
|
||||
"ever",
|
||||
"every",
|
||||
"for",
|
||||
"had",
|
||||
"have",
|
||||
"how",
|
||||
"into",
|
||||
"isn",
|
||||
"maybe",
|
||||
"non",
|
||||
"nor",
|
||||
"now",
|
||||
"should",
|
||||
"such",
|
||||
"than",
|
||||
"that",
|
||||
"then",
|
||||
"these",
|
||||
"this",
|
||||
"those",
|
||||
"though",
|
||||
"too",
|
||||
"was",
|
||||
"wasn",
|
||||
"were",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"which",
|
||||
"while",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"will",
|
||||
"with",
|
||||
"would",
|
||||
"wouldn",
|
||||
"yes"
|
||||
]
|
||||
}
|
|
@ -6,13 +6,143 @@
|
|||
],
|
||||
workers: {
|
||||
processors: {
|
||||
segmenters: [:punkt],
|
||||
tokenizers: [],
|
||||
segmenters: [:scalpel],
|
||||
tokenizers: [:ptb,:stanford],
|
||||
parsers: [:stanford]
|
||||
},
|
||||
lexicalizers: {
|
||||
taggers: [:stanford],
|
||||
categorizers: [:from_tag]
|
||||
}
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
[
|
||||
"ailleurs",
|
||||
"ainsi",
|
||||
"alors",
|
||||
"aucun",
|
||||
"aucune",
|
||||
"auquel",
|
||||
"aurai",
|
||||
"auras",
|
||||
"aurez",
|
||||
"aurons",
|
||||
"auront",
|
||||
"aussi",
|
||||
"autre",
|
||||
"autres",
|
||||
"aux",
|
||||
"auxquelles",
|
||||
"auxquels",
|
||||
"avaient",
|
||||
"avais",
|
||||
"avait",
|
||||
"avec",
|
||||
"avez",
|
||||
"aviez",
|
||||
"avoir",
|
||||
"avons",
|
||||
"celui",
|
||||
"cependant",
|
||||
"certaine",
|
||||
"certaines",
|
||||
"certains",
|
||||
"ces",
|
||||
"cet",
|
||||
"cette",
|
||||
"ceux",
|
||||
"chacun",
|
||||
"chacune",
|
||||
"chaque",
|
||||
"comme",
|
||||
"constamment",
|
||||
"davantage",
|
||||
"depuis",
|
||||
"des",
|
||||
"desquelles",
|
||||
"desquels",
|
||||
"dessous",
|
||||
"dessus",
|
||||
"donc",
|
||||
"dont",
|
||||
"duquel",
|
||||
"egalement",
|
||||
"elles",
|
||||
"encore",
|
||||
"enfin",
|
||||
"ensuite",
|
||||
"etaient",
|
||||
"etais",
|
||||
"etait",
|
||||
"etes",
|
||||
"etiez",
|
||||
"etions",
|
||||
"etre",
|
||||
"eux",
|
||||
"guere",
|
||||
"ici",
|
||||
"ils",
|
||||
"jamais",
|
||||
"jusqu",
|
||||
"laquelle",
|
||||
"legerement",
|
||||
"lequel",
|
||||
"les",
|
||||
"lesquelles",
|
||||
"lesquels",
|
||||
"leur",
|
||||
"leurs",
|
||||
"lors",
|
||||
"lui",
|
||||
"maintenant",
|
||||
"mais",
|
||||
"malgre",
|
||||
"moi",
|
||||
"moins",
|
||||
"notamment",
|
||||
"parce",
|
||||
"plupart",
|
||||
"pourtant",
|
||||
"presentement",
|
||||
"presque",
|
||||
"puis",
|
||||
"puisque",
|
||||
"quand",
|
||||
"quant",
|
||||
"que",
|
||||
"quel",
|
||||
"quelqu",
|
||||
"quelque",
|
||||
"quelques",
|
||||
"qui",
|
||||
"quoi",
|
||||
"quoique",
|
||||
"rien",
|
||||
"selon",
|
||||
"serai",
|
||||
"seras",
|
||||
"serez",
|
||||
"serons",
|
||||
"seront",
|
||||
"soient",
|
||||
"soit",
|
||||
"sommes",
|
||||
"sont",
|
||||
"sous",
|
||||
"suis",
|
||||
"telle",
|
||||
"telles",
|
||||
"tels",
|
||||
"toi",
|
||||
"toujours",
|
||||
"tout",
|
||||
"toutes",
|
||||
"tres",
|
||||
"trop",
|
||||
"une",
|
||||
"vos",
|
||||
"votre",
|
||||
"vous"
|
||||
]
|
||||
|
||||
}
|
|
@ -1,3 +1,5 @@
|
|||
#encoding: UTF-8
|
||||
|
||||
{
|
||||
dependencies: [
|
||||
'punkt-segmenter',
|
||||
|
@ -6,13 +8,130 @@
|
|||
],
|
||||
workers: {
|
||||
processors: {
|
||||
segmenters: [:punkt],
|
||||
tokenizers: [],
|
||||
segmenters: [:tactful, :punkt, :stanford, :scalpel],
|
||||
tokenizers: [:stanford, :punkt],
|
||||
parsers: [:stanford]
|
||||
},
|
||||
lexicalizers: {
|
||||
taggers: [:stanford],
|
||||
categorizers: [:from_tag]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
[
|
||||
"alle",
|
||||
"allem",
|
||||
"alles",
|
||||
"andere",
|
||||
"anderem",
|
||||
"anderen",
|
||||
"anderer",
|
||||
"anderes",
|
||||
"auf",
|
||||
"bei",
|
||||
"beim",
|
||||
"bist",
|
||||
"dadurch",
|
||||
"dein",
|
||||
"deine",
|
||||
"deiner",
|
||||
"deines",
|
||||
"deins",
|
||||
"dem",
|
||||
"denen",
|
||||
"der",
|
||||
"deren",
|
||||
"des",
|
||||
"deshalb",
|
||||
"dessen",
|
||||
"diese",
|
||||
"diesem",
|
||||
"diesen",
|
||||
"dieser",
|
||||
"dieses",
|
||||
"ein",
|
||||
"eine",
|
||||
"einem",
|
||||
"einen",
|
||||
"einer",
|
||||
"eines",
|
||||
"euer",
|
||||
"euere",
|
||||
"eueren",
|
||||
"eueres",
|
||||
"für",
|
||||
"haben",
|
||||
"habt",
|
||||
"hatte",
|
||||
"hatten",
|
||||
"hattest",
|
||||
"hattet",
|
||||
"hierzu",
|
||||
"hinter",
|
||||
"ich",
|
||||
"ihr",
|
||||
"ihre",
|
||||
"ihren",
|
||||
"ihrer",
|
||||
"ihres",
|
||||
"indem",
|
||||
"ist",
|
||||
"jede",
|
||||
"jedem",
|
||||
"jeden",
|
||||
"jeder",
|
||||
"jedes",
|
||||
"kann",
|
||||
"kannst",
|
||||
"können",
|
||||
"könnt",
|
||||
"konnte",
|
||||
"konnten",
|
||||
"konntest",
|
||||
"konntet",
|
||||
"mehr",
|
||||
"mein",
|
||||
"meine",
|
||||
"meiner",
|
||||
"meines",
|
||||
"meins",
|
||||
"nach",
|
||||
"neben",
|
||||
"nicht",
|
||||
"nichts",
|
||||
"seid",
|
||||
"sein",
|
||||
"seine",
|
||||
"seiner",
|
||||
"seines",
|
||||
"seins",
|
||||
"sie",
|
||||
"sind",
|
||||
"über",
|
||||
"und",
|
||||
"uns",
|
||||
"unser",
|
||||
"unsere",
|
||||
"unter",
|
||||
"vor",
|
||||
"warst",
|
||||
"weil",
|
||||
"wenn",
|
||||
"werde",
|
||||
"werden",
|
||||
"werdet",
|
||||
"willst",
|
||||
"wir",
|
||||
"wird",
|
||||
"wirst",
|
||||
"wollen",
|
||||
"wollt",
|
||||
"wollte",
|
||||
"wollten",
|
||||
"wolltest",
|
||||
"wolltet",
|
||||
"zum",
|
||||
"zur"
|
||||
]
|
||||
}
|
||||
|
||||
|
|
|
@ -8,5 +8,155 @@
|
|||
segmenters: [:punkt],
|
||||
tokenizers: []
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
[
|
||||
"affinche",
|
||||
"alcun",
|
||||
"alcuna",
|
||||
"alcune",
|
||||
"alcuni",
|
||||
"alcuno",
|
||||
"allora",
|
||||
"altra",
|
||||
"altre",
|
||||
"altri",
|
||||
"altro",
|
||||
"anziche",
|
||||
"certa",
|
||||
"certe",
|
||||
"certi",
|
||||
"certo",
|
||||
"che",
|
||||
"chi",
|
||||
"chiunque",
|
||||
"comunque",
|
||||
"con",
|
||||
"cosa",
|
||||
"cose",
|
||||
"cui",
|
||||
"dagli",
|
||||
"dai",
|
||||
"dall",
|
||||
"dalla",
|
||||
"dalle",
|
||||
"darsi",
|
||||
"degli",
|
||||
"del",
|
||||
"dell",
|
||||
"della",
|
||||
"delle",
|
||||
"dello",
|
||||
"dunque",
|
||||
"egli",
|
||||
"eppure",
|
||||
"esse",
|
||||
"essi",
|
||||
"forse",
|
||||
"gia",
|
||||
"infatti",
|
||||
"inoltre",
|
||||
"invece",
|
||||
"lui",
|
||||
"malgrado",
|
||||
"mediante",
|
||||
"meno",
|
||||
"mentre",
|
||||
"mie",
|
||||
"miei",
|
||||
"mio",
|
||||
"modo",
|
||||
"molta",
|
||||
"molte",
|
||||
"molti",
|
||||
"molto",
|
||||
"negli",
|
||||
"nel",
|
||||
"nella",
|
||||
"nelle",
|
||||
"nessun",
|
||||
"nessuna",
|
||||
"nessuno",
|
||||
"niente",
|
||||
"noi",
|
||||
"nostra",
|
||||
"nostre",
|
||||
"nostri",
|
||||
"nostro",
|
||||
"nulla",
|
||||
"occorre",
|
||||
"ogni",
|
||||
"ognuno",
|
||||
"oltre",
|
||||
"oltretutto",
|
||||
"oppure",
|
||||
"ovunque",
|
||||
"ovvio",
|
||||
"percio",
|
||||
"pertanto",
|
||||
"piu",
|
||||
"piuttosto",
|
||||
"poca",
|
||||
"poco",
|
||||
"poiche",
|
||||
"propri",
|
||||
"proprie",
|
||||
"proprio",
|
||||
"puo",
|
||||
"qua",
|
||||
"qual",
|
||||
"qualche",
|
||||
"qualcuna",
|
||||
"qualcuno",
|
||||
"quale",
|
||||
"quali",
|
||||
"qualunque",
|
||||
"quando",
|
||||
"quant",
|
||||
"quante",
|
||||
"quanti",
|
||||
"quanto",
|
||||
"quantunque",
|
||||
"quegli",
|
||||
"quei",
|
||||
"quest",
|
||||
"questa",
|
||||
"queste",
|
||||
"questi",
|
||||
"questo",
|
||||
"qui",
|
||||
"quindi",
|
||||
"sebbene",
|
||||
"sembra",
|
||||
"sempre",
|
||||
"senza",
|
||||
"soltanto",
|
||||
"stessa",
|
||||
"stesse",
|
||||
"stessi",
|
||||
"stesso",
|
||||
"sugli",
|
||||
"sui",
|
||||
"sul",
|
||||
"sull",
|
||||
"sulla",
|
||||
"sulle",
|
||||
"suo",
|
||||
"suoi",
|
||||
"taluni",
|
||||
"taluno",
|
||||
"tanta",
|
||||
"tanti",
|
||||
"tanto",
|
||||
"tra",
|
||||
"tuo",
|
||||
"tuoi",
|
||||
"tutt",
|
||||
"tutta",
|
||||
"tutte",
|
||||
"tutto",
|
||||
"una",
|
||||
"uno",
|
||||
"voi"
|
||||
]
|
||||
}
|
||||
|
|
|
@ -8,5 +8,284 @@
|
|||
segmenters: [:punkt],
|
||||
tokenizers: []
|
||||
}
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
[
|
||||
"abans",
|
||||
"aca",
|
||||
"acerca",
|
||||
"ahora",
|
||||
"aixo",
|
||||
"algo",
|
||||
"algu",
|
||||
"alguien",
|
||||
"algun",
|
||||
"alguna",
|
||||
"algunas",
|
||||
"algunes",
|
||||
"alguno",
|
||||
"algunos",
|
||||
"alguns",
|
||||
"alla",
|
||||
"alli",
|
||||
"allo",
|
||||
"altra",
|
||||
"altre",
|
||||
"altres",
|
||||
"amb",
|
||||
"amunt",
|
||||
"antes",
|
||||
"aquel",
|
||||
"aquell",
|
||||
"aquella",
|
||||
"aquellas",
|
||||
"aquelles",
|
||||
"aquellos",
|
||||
"aquells",
|
||||
"aquest",
|
||||
"aquesta",
|
||||
"aquestes",
|
||||
"aquests",
|
||||
"aqui",
|
||||
"asimismo",
|
||||
"aun",
|
||||
"aunque",
|
||||
"avall",
|
||||
"cada",
|
||||
"casi",
|
||||
"com",
|
||||
"como",
|
||||
"con",
|
||||
"cosas",
|
||||
"coses",
|
||||
"cual",
|
||||
"cuales",
|
||||
"cualquier",
|
||||
"cuando",
|
||||
"damunt",
|
||||
"darrera",
|
||||
"davant",
|
||||
"debe",
|
||||
"deben",
|
||||
"deber",
|
||||
"debia",
|
||||
"debian",
|
||||
"decia",
|
||||
"decian",
|
||||
"decir",
|
||||
"deia",
|
||||
"deien",
|
||||
"del",
|
||||
"demasiado",
|
||||
"des",
|
||||
"desde",
|
||||
"despues",
|
||||
"dicen",
|
||||
"diciendo",
|
||||
"dins",
|
||||
"dir",
|
||||
"diu",
|
||||
"diuen",
|
||||
"doncs",
|
||||
"ell",
|
||||
"ellas",
|
||||
"elles",
|
||||
"ells",
|
||||
"els",
|
||||
"encara",
|
||||
"entonces",
|
||||
"ese",
|
||||
"esos",
|
||||
"esser",
|
||||
"esta",
|
||||
"estan",
|
||||
"estando",
|
||||
"estant",
|
||||
"estar",
|
||||
"estaria",
|
||||
"estarian",
|
||||
"estarien",
|
||||
"estas",
|
||||
"estos",
|
||||
"farien",
|
||||
"feia",
|
||||
"feien",
|
||||
"fent",
|
||||
"fue",
|
||||
"fueron",
|
||||
"gaire",
|
||||
"gairebe",
|
||||
"hace",
|
||||
"hacia",
|
||||
"hacian",
|
||||
"haciendo",
|
||||
"haran",
|
||||
"hauria",
|
||||
"haurien",
|
||||
"hemos",
|
||||
"hola",
|
||||
"junto",
|
||||
"lejos",
|
||||
"les",
|
||||
"lloc",
|
||||
"los",
|
||||
"menos",
|
||||
"menys",
|
||||
"meva",
|
||||
"mias",
|
||||
"mio",
|
||||
"misma",
|
||||
"mismas",
|
||||
"mismo",
|
||||
"mismos",
|
||||
"molt",
|
||||
"molta",
|
||||
"moltes",
|
||||
"mon",
|
||||
"mucha",
|
||||
"mucho",
|
||||
"muy",
|
||||
"nadie",
|
||||
"ningu",
|
||||
"nomes",
|
||||
"nosaltres",
|
||||
"nosotros",
|
||||
"nostra",
|
||||
"nostre",
|
||||
"nuestra",
|
||||
"nuestras",
|
||||
"nuestro",
|
||||
"nuestros",
|
||||
"nunca",
|
||||
"otra",
|
||||
"pasa",
|
||||
"pasan",
|
||||
"pasara",
|
||||
"pasaria",
|
||||
"passara",
|
||||
"passaria",
|
||||
"passen",
|
||||
"perque",
|
||||
"poc",
|
||||
"pocas",
|
||||
"pocos",
|
||||
"podem",
|
||||
"poden",
|
||||
"podeu",
|
||||
"podria",
|
||||
"podrian",
|
||||
"podrien",
|
||||
"poques",
|
||||
"porque",
|
||||
"potser",
|
||||
"puc",
|
||||
"pudieron",
|
||||
"pudo",
|
||||
"puede",
|
||||
"pueden",
|
||||
"puesto",
|
||||
"qualsevol",
|
||||
"quan",
|
||||
"que",
|
||||
"queria",
|
||||
"querian",
|
||||
"qui",
|
||||
"quien",
|
||||
"quienes",
|
||||
"quiere",
|
||||
"quieren",
|
||||
"quin",
|
||||
"quina",
|
||||
"quines",
|
||||
"quins",
|
||||
"quizas",
|
||||
"segueent",
|
||||
"segun",
|
||||
"sempre",
|
||||
"seran",
|
||||
"seria",
|
||||
"serian",
|
||||
"seu",
|
||||
"seva",
|
||||
"sido",
|
||||
"siempre",
|
||||
"siendo",
|
||||
"siguiente",
|
||||
"sino",
|
||||
"sobretodo",
|
||||
"solamente",
|
||||
"sovint",
|
||||
"suya",
|
||||
"suyas",
|
||||
"suyo",
|
||||
"suyos",
|
||||
"tambe",
|
||||
"tambien",
|
||||
"tanmateix",
|
||||
"tanta",
|
||||
"tanto",
|
||||
"tendran",
|
||||
"tendria",
|
||||
"tendrian",
|
||||
"tenen",
|
||||
"teu",
|
||||
"teva",
|
||||
"tiene",
|
||||
"tienen",
|
||||
"tindran",
|
||||
"tindria",
|
||||
"tindrien",
|
||||
"toda",
|
||||
"todavia",
|
||||
"todo",
|
||||
"tota",
|
||||
"totes",
|
||||
"tras",
|
||||
"traves",
|
||||
"tuvieron",
|
||||
"tuvo",
|
||||
"tuya",
|
||||
"tuyas",
|
||||
"tuyo",
|
||||
"tuyos",
|
||||
"unas",
|
||||
"unes",
|
||||
"unos",
|
||||
"uns",
|
||||
"usaba",
|
||||
"usaban",
|
||||
"usada",
|
||||
"usades",
|
||||
"usado",
|
||||
"usan",
|
||||
"usando",
|
||||
"usant",
|
||||
"usar",
|
||||
"usat",
|
||||
"usava",
|
||||
"usaven",
|
||||
"usen",
|
||||
"vaig",
|
||||
"varem",
|
||||
"varen",
|
||||
"vareu",
|
||||
"vegada",
|
||||
"vegades",
|
||||
"vez",
|
||||
"volem",
|
||||
"volen",
|
||||
"voleu",
|
||||
"vora",
|
||||
"vos",
|
||||
"vosaltres",
|
||||
"vosotros",
|
||||
"vostra",
|
||||
"vostre",
|
||||
"voy",
|
||||
"vuestra",
|
||||
"vuestras",
|
||||
"vuestro",
|
||||
"vuestros",
|
||||
"vull"
|
||||
]
|
||||
}
|
|
@ -8,5 +8,282 @@
|
|||
segmenters: [:punkt],
|
||||
tokenizers: []
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
stop_words:
|
||||
[
|
||||
"atminstone",
|
||||
"an",
|
||||
"anda",
|
||||
"aven",
|
||||
"aldrig",
|
||||
"alla",
|
||||
"alls",
|
||||
"allt",
|
||||
"alltid",
|
||||
"allting",
|
||||
"alltsa",
|
||||
"andra",
|
||||
"annan",
|
||||
"annars",
|
||||
"antingen",
|
||||
"att",
|
||||
"bakom",
|
||||
"bland",
|
||||
"blev",
|
||||
"bli",
|
||||
"bliva",
|
||||
"blivit",
|
||||
"bort",
|
||||
"bortom",
|
||||
"bredvid",
|
||||
"dar",
|
||||
"darav",
|
||||
"darefter",
|
||||
"darfor",
|
||||
"dari",
|
||||
"darigenom",
|
||||
"darvid",
|
||||
"dedar",
|
||||
"definitivt",
|
||||
"del",
|
||||
"den",
|
||||
"dendar",
|
||||
"denhar",
|
||||
"denna",
|
||||
"deras",
|
||||
"dessa",
|
||||
"dessutom",
|
||||
"desto",
|
||||
"det",
|
||||
"detta",
|
||||
"dylik",
|
||||
"efterat",
|
||||
"efter",
|
||||
"eftersom",
|
||||
"eller",
|
||||
"emellertid",
|
||||
"enbart",
|
||||
"endast",
|
||||
"enligt",
|
||||
"ens",
|
||||
"ensam",
|
||||
"envar",
|
||||
"eran",
|
||||
"etc",
|
||||
"ett",
|
||||
"exakt",
|
||||
"fatt",
|
||||
"fastan",
|
||||
"fick",
|
||||
"fler",
|
||||
"flera",
|
||||
"foljande",
|
||||
"foljde",
|
||||
"foljer",
|
||||
"for",
|
||||
"fore",
|
||||
"forhoppningsvis",
|
||||
"formodligen",
|
||||
"forr",
|
||||
"forra",
|
||||
"forutom",
|
||||
"forvisso",
|
||||
"fran",
|
||||
"framfor",
|
||||
"fullstandigt",
|
||||
"gang",
|
||||
"gar",
|
||||
"gatt",
|
||||
"ganska",
|
||||
"gav",
|
||||
"genom",
|
||||
"genomgaende",
|
||||
"ger",
|
||||
"gick",
|
||||
"gjorde",
|
||||
"gjort",
|
||||
"gor",
|
||||
"hade",
|
||||
"har",
|
||||
"harav",
|
||||
"har",
|
||||
"hej",
|
||||
"hela",
|
||||
"helst",
|
||||
"helt",
|
||||
"hitta",
|
||||
"hon",
|
||||
"honom",
|
||||
"hur",
|
||||
"huruvida",
|
||||
"huvudsakligen",
|
||||
"ibland",
|
||||
"icke",
|
||||
"ickedestomindre",
|
||||
"igen",
|
||||
"ihop",
|
||||
"inat",
|
||||
"ingen",
|
||||
"ingenstans",
|
||||
"inget",
|
||||
"innan",
|
||||
"innehalla",
|
||||
"inre",
|
||||
"inte",
|
||||
"inuti",
|
||||
"istaellet",
|
||||
"kanske",
|
||||
"klart",
|
||||
"knappast",
|
||||
"knappt",
|
||||
"kom",
|
||||
"komma",
|
||||
"kommer",
|
||||
"kraver",
|
||||
"kunde",
|
||||
"kunna",
|
||||
"lata",
|
||||
"later",
|
||||
"lagga",
|
||||
"langre",
|
||||
"laet",
|
||||
"lagd",
|
||||
"leta",
|
||||
"letar",
|
||||
"manga",
|
||||
"maste",
|
||||
"med",
|
||||
"medan",
|
||||
"medans",
|
||||
"mellan",
|
||||
"mest",
|
||||
"min",
|
||||
"mindre",
|
||||
"minst",
|
||||
"mittemellan",
|
||||
"motsvarande",
|
||||
"mycket",
|
||||
"nagon",
|
||||
"nagongang",
|
||||
"nagonsin",
|
||||
"nagonstans",
|
||||
"nagonting",
|
||||
"nagorlunda",
|
||||
"nagot",
|
||||
"namligen",
|
||||
"nar",
|
||||
"nara",
|
||||
"nasta",
|
||||
"nastan",
|
||||
"nedat",
|
||||
"nedanfor",
|
||||
"nerat",
|
||||
"ner",
|
||||
"nog",
|
||||
"normalt",
|
||||
"nummer",
|
||||
"nuvarande",
|
||||
"nytt",
|
||||
"oavsett",
|
||||
"och",
|
||||
"ocksa",
|
||||
"oppna",
|
||||
"over",
|
||||
"overallt",
|
||||
"ofta",
|
||||
"okej",
|
||||
"olika",
|
||||
"ovanfor",
|
||||
"ratt",
|
||||
"redan",
|
||||
"relativt",
|
||||
"respektive",
|
||||
"rimlig",
|
||||
"rimligen",
|
||||
"rimligt",
|
||||
"salunda",
|
||||
"savida",
|
||||
"saga",
|
||||
"sager",
|
||||
"sakert",
|
||||
"sand",
|
||||
"sarskilt",
|
||||
"satt",
|
||||
"sak",
|
||||
"samma",
|
||||
"samtliga",
|
||||
"sedd",
|
||||
"senare",
|
||||
"senaste",
|
||||
"ser",
|
||||
"sig",
|
||||
"sista",
|
||||
"sjaelv",
|
||||
"ska",
|
||||
"skall",
|
||||
"skickad",
|
||||
"skriva",
|
||||
"skulle",
|
||||
"snabb",
|
||||
"snarare",
|
||||
"snart",
|
||||
"som",
|
||||
"somliga",
|
||||
"speciellt",
|
||||
"stalla",
|
||||
"stallet",
|
||||
"starta",
|
||||
"strax",
|
||||
"stundom",
|
||||
"tackar",
|
||||
"tanka",
|
||||
"taga",
|
||||
"tagen",
|
||||
"tala",
|
||||
"tanke",
|
||||
"tidigare",
|
||||
"tills",
|
||||
"tog",
|
||||
"totalt",
|
||||
"trolig",
|
||||
"troligen",
|
||||
"tvaers",
|
||||
"tvars",
|
||||
"tycka",
|
||||
"tyckte",
|
||||
"tyvarr",
|
||||
"understundom",
|
||||
"upp",
|
||||
"uppenbarligen",
|
||||
"uppenbart",
|
||||
"utan",
|
||||
"utanfor",
|
||||
"uteslutande",
|
||||
"utom",
|
||||
"var",
|
||||
"varan",
|
||||
"vad",
|
||||
"val",
|
||||
"varde",
|
||||
"vanlig",
|
||||
"vanligen",
|
||||
"var",
|
||||
"vare",
|
||||
"varenda",
|
||||
"varfor",
|
||||
"varifran",
|
||||
"varit",
|
||||
"varje",
|
||||
"varken",
|
||||
"vars",
|
||||
"vart",
|
||||
"vem",
|
||||
"verkligen",
|
||||
"vidare",
|
||||
"vilken",
|
||||
"vill",
|
||||
"visar",
|
||||
"visst",
|
||||
"visste"
|
||||
]
|
||||
}
|
||||
|
|
|
@ -8,5 +8,9 @@
|
|||
stanford: {
|
||||
jar_path: nil,
|
||||
model_path: nil
|
||||
},
|
||||
open_nlp: {
|
||||
jar_path: nil,
|
||||
model_path: nil
|
||||
}
|
||||
}
|
|
@ -24,8 +24,9 @@
|
|||
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
||||
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
||||
'Verb phrase', ['', '', 'VP', '', '', ''],
|
||||
'Inverted yes/no question', ['', '', 'SQ', '', '', ''],
|
||||
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
||||
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
||||
'Wh adverb phrase', ['', '', 'WHADVP', '', '', ''],
|
||||
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
||||
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
||||
'Unknown', ['', '', 'X', '', '', ''],
|
||||
|
@ -100,7 +101,7 @@
|
|||
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
||||
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
|
||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # FIXME
|
||||
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
||||
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
||||
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
||||
|
@ -181,7 +182,7 @@
|
|||
|
||||
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
||||
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
||||
'Punctuationm, comma', ['PUN', ',', ',', '$,'],
|
||||
'Punctuation, comma', ['PUN', ',', ',', '$,'],
|
||||
'Punctuation, dash', ['PUN', '-', '-'],
|
||||
'Punctuation, dollar sign', ['PUN', '', '$'],
|
||||
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
||||
|
@ -324,4 +325,4 @@
|
|||
['SQ', 'Inverted yes/no question']
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,5 +27,13 @@
|
|||
tf_idf: {
|
||||
type: :annotator,
|
||||
targets: [:word]
|
||||
},
|
||||
similarity: {
|
||||
type: :computer,
|
||||
targets: [:entity]
|
||||
},
|
||||
distance: {
|
||||
type: :computer,
|
||||
targets: [:entity]
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
{
|
||||
taggers: {
|
||||
type: :annotator,
|
||||
targets: [:phrase, :token]
|
||||
targets: [:group, :token],
|
||||
recursive: true
|
||||
},
|
||||
categorizers: {
|
||||
type: :annotator,
|
||||
targets: [:phrase, :token],
|
||||
targets: [:group, :token],
|
||||
recursive: true
|
||||
},
|
||||
sensers: {
|
||||
|
@ -14,5 +15,5 @@
|
|||
preset_option: :nym,
|
||||
presets: [:synonyms, :antonyms,
|
||||
:hyponyms, :hypernyms],
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
# Mixin extended by Treat::Config. It exposes one
# entry point, import!, which loads every configuration.
module Treat::Config::Importable

  # Every configuration class is extended with
  # Configurable during the import.
  require_relative 'configurable'

  # Give the extending object a `config` accessor,
  # in which the imported configuration is stored.
  def self.extended(base)
    base.singleton_class.class_eval do
      attr_accessor :config
    end
  end

  # Main function. For each constant of Treat::Config
  # that is not a mixin: extend it with Configurable,
  # call configure! on it, keep its config under the
  # downcased constant name, and define a reader of
  # that name on Treat. The collected hash is finally
  # stored in self.config as a struct.
  def import!
    configurable = Treat::Config::Configurable
    collected = {}
    Treat::Config.constants.each do |const|
      downcased = const.to_s.downcase
      next if downcased.is_mixin?
      klass = Treat::Config.const_get(const)
      klass.class_eval { extend configurable }
      klass.configure!
      key = downcased.intern
      collected[key] = klass.config
      # The reader looks the value up lazily, so it
      # sees whatever Treat::Config.config holds
      # at call time.
      Treat.define_singleton_method(key) do
        Treat::Config.config[key]
      end
    end
    self.config = collected.to_struct
  end

end
|
|
@ -1,9 +1,13 @@
|
|||
# Generates the following path config options:
|
||||
# Treat.paths.tmp, Treat.paths.bin, Treat.paths.lib,
|
||||
# Treat.paths.models, Treat.paths.files, Treat.paths.spec.
|
||||
class Treat::Config::Paths
|
||||
|
||||
# Get the path configuration based on the
|
||||
# directory structure loaded into Paths.
|
||||
# Note that this doesn't call super, as
|
||||
# there are no external config files to load.
|
||||
def self.configure!
|
||||
super
|
||||
root = File.dirname(File.expand_path( # FIXME
|
||||
__FILE__)).split('/')[0..-4].join('/') + '/'
|
||||
self.config = Hash[
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# Handles all configuration related
|
||||
# to understanding of part of speech
|
||||
# and phrasal tags.
|
||||
class Treat::Config::Tags
|
||||
|
||||
# Load and align tags.
|
||||
# Generate a map of word and phrase tags
|
||||
# to their syntactic category, keyed by
|
||||
# tag set.
|
||||
|
@ -16,21 +18,20 @@ class Treat::Config::Tags
|
|||
align_tags(phrase_tags, tag_sets)
|
||||
self.config[:aligned] = config
|
||||
end
|
||||
|
||||
# * Helper methods for tag set config * #
|
||||
|
||||
# Align tag tags in the tag set
|
||||
def self.align_tags(tags, tag_sets)
|
||||
wttc = {}
|
||||
tags.each_slice(2) do |desc, tags|
|
||||
category = desc.gsub(',', ' ,').
|
||||
split(' ')[0].downcase
|
||||
tag_sets.each_with_index do |tag_set, i|
|
||||
next unless tags[i]
|
||||
wttc[tags[i]] ||= {}
|
||||
wttc[tags[i]][tag_set] = category
|
||||
end
|
||||
end; return wttc
|
||||
end
|
||||
|
||||
# Helper methods for tag set config.

# Align the tags of each tag set with a syntactic category.
#
# tags     - flat array alternating a description string
#            and an array of tags, one slot per tag set
#            (in the same order as tag_sets).
# tag_sets - array of tag set identifiers.
#
# The category is the first word of the description,
# downcased (e.g. 'Pronoun, reflexive' gives 'pronoun').
#
# Returns a hash of the form
# { tag => { tag_set => category } }.
def self.align_tags(tags, tag_sets)
  wttc = {}
  # The block parameter is named set_tags so that it
  # does not shadow the tags argument.
  tags.each_slice(2) do |desc, set_tags|
    category = desc.gsub(',', ' ,').split(' ')[0].downcase
    tag_sets.each_with_index do |tag_set, i|
      tag = set_tags[i]
      # Only a missing slot is skipped.
      # NOTE(review): empty-string slots are still
      # recorded under the '' key - confirm that this
      # is intended.
      next unless tag
      (wttc[tag] ||= {})[tag_set] = category
    end
  end
  wttc
end
|
||||
|
||||
end
|
|
@ -1,36 +1,21 @@
|
|||
module Treat::Core::DSL
|
||||
|
||||
# Include DSL on base.
|
||||
def self.included(base)
|
||||
self.sweeten_entities(base)
|
||||
self.sweeten_learning(base)
|
||||
end
|
||||
|
||||
# Map all classes in Treat::Entities to
|
||||
# a global builder function (Entity, etc.)
|
||||
def self.sweeten_entities(base, on = true)
|
||||
Treat.core.entities.list.each do |type|
|
||||
next if type == :Symbol
|
||||
kname = type.cc.intern
|
||||
klass = Treat::Entities.const_get(kname)
|
||||
Object.class_eval do
|
||||
define_method(kname) do |val, opts={}|
|
||||
klass.build(val, opts)
|
||||
end if on
|
||||
remove_method(name) if !on
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Map all classes in the Learning module
|
||||
# to a global builder function (e.g. DataSet).
|
||||
def self.sweeten_learning(base, on = true)
|
||||
Treat::Learning.constants.each do |kname|
|
||||
Object.class_eval do
|
||||
define_method(kname) do |*args|
|
||||
Treat::Learning.const_get(kname).new(*args)
|
||||
end if on
|
||||
remove_method(name) if !on
|
||||
# a global builder function (entity, word,
|
||||
# phrase, punctuation, symbol, list, etc.)
|
||||
def self.included(base)
|
||||
def method_missing(sym,*args,&block)
|
||||
@@entities ||= Treat.core.entities.list
|
||||
@@learning ||= Treat.core.learning.list
|
||||
if @@entities.include?(sym)
|
||||
klass = Treat::Entities.const_get(sym.cc)
|
||||
return klass.build(*args)
|
||||
elsif @@learning.include?(sym)
|
||||
klass = Treat::Learning.const_get(sym.cc)
|
||||
return klass.new(*args)
|
||||
else
|
||||
super(sym,*args,&block)
|
||||
raise "Uncaught method ended up in Treat DSL."
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# A dependency manager for Treat language plugins.
|
||||
# Usage: Treat::Installer.install('language')
|
||||
module Treat::Core::Installer
|
||||
|
||||
|
||||
require 'schiphol'
|
||||
|
||||
# Address of the server with the files.
|
||||
Server = 'www.louismullie.com'
|
||||
Server = 's3.amazonaws.com/static-public-assets'
|
||||
|
||||
# Filenames for the Stanford packages.
|
||||
StanfordPackages = {
|
||||
|
@ -20,34 +20,34 @@ module Treat::Core::Installer
|
|||
:bin => File.absolute_path(Treat.paths.bin),
|
||||
:models => File.absolute_path(Treat.paths.models)
|
||||
}
|
||||
|
||||
|
||||
# Install required dependencies and optional
|
||||
# dependencies for a specific language.
|
||||
def self.install(language = 'english')
|
||||
|
||||
|
||||
# Require the Rubygem dependency installer.
|
||||
silence_warnings do
|
||||
require 'rubygems/dependency_installer'
|
||||
end
|
||||
|
||||
|
||||
@@installer = Gem::DependencyInstaller.new
|
||||
|
||||
|
||||
if language == 'travis'
|
||||
install_travis; return
|
||||
end
|
||||
|
||||
|
||||
l = "#{language.to_s.capitalize} language"
|
||||
|
||||
puts "\nTreat Installer, v. #{Treat::VERSION.to_s}\n\n"
|
||||
|
||||
|
||||
begin
|
||||
|
||||
title "Installing core dependencies."
|
||||
install_language_dependencies('agnostic')
|
||||
|
||||
|
||||
title "Installing dependencies for the #{l}.\n"
|
||||
install_language_dependencies(language)
|
||||
|
||||
|
||||
# If gem is installed only, download models.
|
||||
begin
|
||||
Gem::Specification.find_by_name('punkt-segmenter')
|
||||
|
@ -73,7 +73,7 @@ module Treat::Core::Installer
|
|||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
# Minimal install for Travis CI.
|
||||
def self.install_travis
|
||||
install_language_dependencies(:agnostic)
|
||||
|
@ -81,7 +81,7 @@ module Treat::Core::Installer
|
|||
download_stanford(:minimal)
|
||||
download_punkt_models(:english)
|
||||
end
|
||||
|
||||
|
||||
|
||||
def self.install_language_dependencies(language)
|
||||
dependencies = Treat.languages[language].dependencies
|
||||
|
@ -92,31 +92,31 @@ module Treat::Core::Installer
|
|||
end
|
||||
|
||||
def self.download_stanford(package = :minimal)
|
||||
|
||||
|
||||
f = StanfordPackages[package]
|
||||
url = "http://#{Server}/treat/#{f}"
|
||||
loc = Schiphol.download(url,
|
||||
loc = Schiphol.download(url,
|
||||
download_folder: Treat.paths.tmp
|
||||
)
|
||||
puts "- Unzipping package ..."
|
||||
dest = File.join(Treat.paths.tmp, 'stanford')
|
||||
unzip_stanford(loc, dest)
|
||||
|
||||
|
||||
model_dir = File.join(Paths[:models], 'stanford')
|
||||
bin_dir = File.join(Paths[:bin], 'stanford')
|
||||
origin = File.join(Paths[:tmp], 'stanford')
|
||||
|
||||
|
||||
# Mac hidden files fix.
|
||||
mac_remove = File.join(dest, '__MACOSX')
|
||||
if File.readable?(mac_remove)
|
||||
FileUtils.rm_rf(mac_remove)
|
||||
end
|
||||
|
||||
|
||||
unless File.readable?(bin_dir)
|
||||
puts "- Creating directory bin/stanford ..."
|
||||
FileUtils.mkdir_p(bin_dir)
|
||||
end
|
||||
|
||||
|
||||
unless File.readable?(model_dir)
|
||||
puts "- Creating directory models/stanford ..."
|
||||
FileUtils.mkdir_p(model_dir)
|
||||
|
@ -127,18 +127,18 @@ module Treat::Core::Installer
|
|||
Dir.glob(File.join(origin, '*')) do |f|
|
||||
next if ['.', '..'].include?(f)
|
||||
if f.index('jar')
|
||||
FileUtils.cp(f, File.join(Paths[:bin],
|
||||
FileUtils.cp(f, File.join(Paths[:bin],
|
||||
'stanford', File.basename(f)))
|
||||
elsif FileTest.directory?(f)
|
||||
FileUtils.cp_r(f, model_dir)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
puts "- Cleaning up..."
|
||||
FileUtils.rm_rf(origin)
|
||||
|
||||
|
||||
'Done.'
|
||||
|
||||
|
||||
end
|
||||
|
||||
def self.download_punkt_models(language)
|
||||
|
@ -146,7 +146,7 @@ module Treat::Core::Installer
|
|||
f = "#{language}.yaml"
|
||||
dest = "#{Treat.paths.models}punkt/"
|
||||
url = "http://#{Server}/treat/punkt/#{f}"
|
||||
loc = Schiphol.download(url,
|
||||
loc = Schiphol.download(url,
|
||||
download_folder: Treat.paths.tmp
|
||||
)
|
||||
unless File.readable?(dest)
|
||||
|
@ -156,7 +156,7 @@ module Treat::Core::Installer
|
|||
|
||||
puts "- Copying model file to models/punkt ..."
|
||||
FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
|
||||
|
||||
|
||||
puts "- Cleaning up..."
|
||||
FileUtils.rm_rf(Paths[:tmp] + Server)
|
||||
|
||||
|
@ -181,12 +181,11 @@ module Treat::Core::Installer
|
|||
begin
|
||||
puts "Installing #{dependency}...\n"
|
||||
@@installer.install(dependency)
|
||||
rescue Exception => error
|
||||
raise
|
||||
puts "Couldn't install gem '#{dependency}' " +
|
||||
"(#{error.message})."
|
||||
rescue Gem::InstallError => error
|
||||
puts "Warning: couldn't install " +
|
||||
"gem '#{dependency}' (#{error.message})."
|
||||
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
# Unzip a file to the destination path.
|
||||
|
@ -194,7 +193,7 @@ module Treat::Core::Installer
|
|||
|
||||
require 'zip/zip'
|
||||
f_path = ''
|
||||
|
||||
|
||||
Zip::ZipFile.open(file) do |zip_file|
|
||||
zip_file.each do |f|
|
||||
f_path = File.join(destination, f.name)
|
||||
|
|
|
@ -3,6 +3,7 @@ class Treat::Core::Server
|
|||
# Refer to http://rack.rubyforge.org/doc/classes/Rack/Server.html
|
||||
# for possible options to configure.
|
||||
def initialize(handler = 'thin', options = {})
|
||||
raise "Implementation not finished."
|
||||
require 'json'; require 'rack'
|
||||
@handler, @options = handler.capitalize, options
|
||||
end
|
||||
|
|
|
@ -4,6 +4,7 @@ module Treat::Entities
|
|||
|
||||
# Represents a collection.
|
||||
class Collection < Entity; end
|
||||
|
||||
# Represents a document.
|
||||
class Document < Entity; end
|
||||
|
||||
|
@ -18,6 +19,9 @@ module Treat::Entities
|
|||
# Represents a block of text
|
||||
class Block < Section; end
|
||||
|
||||
# Represents a list.
|
||||
class List < Section; end
|
||||
|
||||
# * Zones and related classes * #
|
||||
|
||||
# Represents a zone of text.
|
||||
|
@ -31,9 +35,6 @@ module Treat::Entities
|
|||
# of sentences and/or phrases).
|
||||
class Paragraph < Zone; end
|
||||
|
||||
# Represents a list.
|
||||
class List < Zone; end
|
||||
|
||||
# * Groups and related classes * #
|
||||
|
||||
# Represents a group of tokens.
|
||||
|
|
|
@ -22,7 +22,9 @@ module Treat::Entities
|
|||
attr_accessor :type
|
||||
|
||||
# Autoload all the classes in /abilities.
|
||||
include Treat::Autoload
|
||||
path = File.expand_path(__FILE__)
|
||||
patt = File.dirname(path) + '/entity/*.rb'
|
||||
Dir.glob(patt).each { |f| require f }
|
||||
|
||||
# Implements support for #register, #registry.
|
||||
include Registrable
|
||||
|
@ -82,8 +84,11 @@ module Treat::Entities
|
|||
#
|
||||
# Takes in a single entity or an array of
|
||||
# entities. Returns the first child supplied.
|
||||
# @see Treat::Registrable
|
||||
# If a string or a numeric is supplied, it is first converted to an entity.
|
||||
def <<(entities, clear_parent = true)
|
||||
entities = (entities.is_a?(::String) ||
|
||||
entities.is_a?(::Numeric)) ?
|
||||
entities.to_entity : entities
|
||||
entities = entities.is_a?(::Array) ?
|
||||
entities : [entities]
|
||||
# Register each entity in this node.
|
||||
|
@ -121,7 +126,7 @@ module Treat::Entities
|
|||
# requested method does not exist. Also
|
||||
# provides suggestions for misspellings.
|
||||
def invalid_call(sym)
|
||||
msg = Treat::Workers::Category.lookup(sym) ?
|
||||
msg = Treat::Workers.lookup(sym) ?
|
||||
"Method #{sym} can't be called on a #{type}." :
|
||||
"Method #{sym} is not defined by Treat." +
|
||||
Treat::Helpers::Help.did_you_mean?(
|
||||
|
|
|
@ -57,7 +57,7 @@ module Treat::Entities::Entity::Applicable
|
|||
|
||||
# Get the group of a task.
|
||||
def get_group(task)
|
||||
g = Treat::Workers::Category.lookup(task)
|
||||
g = Treat::Workers.lookup(task)
|
||||
unless g
|
||||
raise Treat::Exception,
|
||||
"Task #{task} does not exist."
|
||||
|
|
|
@ -15,7 +15,21 @@ module Treat::Entities::Entity::Buildable
|
|||
PunctRegexp = /^[[:punct:]\$]+$/
|
||||
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
||||
EmailRegexp = /.+\@.+\..+/
|
||||
Enclitics = %w['ll 'm 're 's 't 've]
|
||||
Enclitics = [
|
||||
# EXAMPLE:
|
||||
"'d", # I'd => I would
|
||||
"'ll", # I'll => I will
|
||||
"'m", # I'm => I am
|
||||
"'re", # We're => We are
|
||||
"'s", # There's => There is
|
||||
# Let's => Let us
|
||||
"'t", # 'Twas => Archaic ('Twas the night)
|
||||
"'ve", # They've => They have
|
||||
"n't" # Can't => Can not
|
||||
]
|
||||
|
||||
# Accepted formats of serialized files
|
||||
AcceptedFormats = ['.xml', '.yml', '.yaml', '.mongo']
|
||||
|
||||
# Reserved folder names
|
||||
Reserved = ['.index']
|
||||
|
@ -23,23 +37,38 @@ module Treat::Entities::Entity::Buildable
|
|||
# Build an entity from anything (can be
|
||||
# a string, numeric, folder, or file name
|
||||
# representing a raw or serialized file).
|
||||
def build(file_or_value, options = {})
|
||||
def build(*args)
|
||||
|
||||
# This probably needs some doc.
|
||||
if args.size == 0
|
||||
file_or_value = ''
|
||||
elsif args[0].is_a?(Hash)
|
||||
file_or_value = args[0]
|
||||
elsif args.size == 1
|
||||
if args[0].is_a?(Treat::Entities::Entity)
|
||||
args[0] = [args[0]]
|
||||
end
|
||||
file_or_value = args[0]
|
||||
else
|
||||
file_or_value = args
|
||||
end
|
||||
|
||||
fv = file_or_value.to_s
|
||||
|
||||
if file_or_value.is_a?(Hash)
|
||||
if fv == ''; self.new
|
||||
elsif file_or_value.is_a?(Array)
|
||||
from_array(file_or_value)
|
||||
elsif file_or_value.is_a?(Hash)
|
||||
from_db(file_or_value)
|
||||
elsif self == Treat::Entities::Document ||
|
||||
(fv.index('yml') || fv.index('yaml') ||
|
||||
fv.index('xml') || fv.index('mongo'))
|
||||
elsif self == Treat::Entities::Document || (is_serialized_file?(fv))
|
||||
if fv =~ UriRegexp
|
||||
from_url(fv, options)
|
||||
from_url(fv)
|
||||
else
|
||||
from_file(fv, options)
|
||||
from_file(fv)
|
||||
end
|
||||
elsif self == Treat::Entities::Collection
|
||||
if FileTest.directory?(fv)
|
||||
from_folder(fv, options)
|
||||
from_folder(fv)
|
||||
else
|
||||
create_collection(fv)
|
||||
end
|
||||
|
@ -78,8 +107,19 @@ module Treat::Entities::Entity::Buildable
|
|||
e
|
||||
end
|
||||
|
||||
# Build an instance of the receiving class from an
# array of builders. Every element that is not already
# an entity is converted with #to_entity, then appended
# to the new instance with <<. Returns that instance.
def from_array(array)
  array.each_with_object(self.new) do |element, container|
    is_entity = element.is_a?(Treat::Entities::Entity)
    container << (is_entity ? element : element.to_entity)
  end
end
|
||||
|
||||
# Build a document from an URL.
|
||||
def from_url(url, options)
|
||||
def from_url(url)
|
||||
unless self ==
|
||||
Treat::Entities::Document
|
||||
raise Treat::Exception,
|
||||
|
@ -88,8 +128,12 @@ module Treat::Entities::Entity::Buildable
|
|||
end
|
||||
|
||||
begin
|
||||
folder = Treat.paths.files
|
||||
if folder[-1] == '/'
|
||||
folder = folder[0..-2]
|
||||
end
|
||||
f = Schiphol.download(url,
|
||||
download_folder: Treat.paths.files,
|
||||
download_folder: folder,
|
||||
show_progress: !Treat.core.verbosity.silence,
|
||||
rectify_extensions: true,
|
||||
max_tries: 3)
|
||||
|
@ -97,10 +141,8 @@ module Treat::Entities::Entity::Buildable
|
|||
raise Treat::Exception,
|
||||
"Couldn't download file at #{url}."
|
||||
end
|
||||
|
||||
options[:default_to] ||= 'html'
|
||||
|
||||
e = from_file(f, options)
|
||||
e = from_file(f,'html')
|
||||
e.set :url, url.to_s
|
||||
e
|
||||
|
||||
|
@ -123,7 +165,7 @@ module Treat::Entities::Entity::Buildable
|
|||
|
||||
# Build an entity from a folder with documents.
|
||||
# Folders will be searched recursively.
|
||||
def from_folder(folder, options)
|
||||
def from_folder(folder)
|
||||
|
||||
return if Reserved.include?(folder)
|
||||
|
||||
|
@ -148,49 +190,44 @@ module Treat::Entities::Entity::Buildable
|
|||
|
||||
c = Treat::Entities::Collection.new(folder)
|
||||
folder += '/' unless folder[-1] == '/'
|
||||
|
||||
|
||||
if !FileTest.directory?(folder)
|
||||
FileUtils.mkdir(folder)
|
||||
end
|
||||
|
||||
|
||||
c.set :folder, folder
|
||||
i = folder + '/.index'
|
||||
c.set :index, i if FileTest.directory?(i)
|
||||
|
||||
|
||||
Dir[folder + '*'].each do |f|
|
||||
if FileTest.directory?(f)
|
||||
c2 = Treat::Entities::Collection.
|
||||
from_folder(f, options)
|
||||
from_folder(f)
|
||||
c.<<(c2, false) if c2
|
||||
else
|
||||
c.<<(Treat::Entities::Document.
|
||||
from_file(f, options), false)
|
||||
from_file(f), false)
|
||||
end
|
||||
end
|
||||
|
||||
c
|
||||
|
||||
return c
|
||||
|
||||
end
|
||||
|
||||
# Build a document from a raw or serialized file.
|
||||
def from_file(file, options)
|
||||
def from_file(file,def_fmt=nil)
|
||||
|
||||
if file.index('yml') ||
|
||||
file.index('yaml') ||
|
||||
file.index('xml') ||
|
||||
file.index('mongo')
|
||||
from_serialized_file(file, options)
|
||||
if is_serialized_file?(file)
|
||||
from_serialized_file(file)
|
||||
else
|
||||
fmt = Treat::Workers::Formatters::Readers::Autoselect.
|
||||
detect_format(file, options[:default_to])
|
||||
options[:_format] = fmt
|
||||
from_raw_file(file, options)
|
||||
fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
|
||||
from_raw_file(file, fmt)
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
# Build a document from a raw file.
|
||||
def from_raw_file(file, options)
|
||||
def from_raw_file(file, def_fmt='txt')
|
||||
|
||||
unless self ==
|
||||
Treat::Entities::Document
|
||||
|
@ -204,7 +241,7 @@ module Treat::Entities::Entity::Buildable
|
|||
"Path '#{file}' does not "+
|
||||
"point to a readable file."
|
||||
end
|
||||
|
||||
options = {default_format: def_fmt}
|
||||
d = Treat::Entities::Document.new
|
||||
d.set :file, file
|
||||
d.read(:autoselect, options)
|
||||
|
@ -212,34 +249,32 @@ module Treat::Entities::Entity::Buildable
|
|||
end
|
||||
|
||||
# Build an entity from a serialized file.
|
||||
def from_serialized_file(file, options)
|
||||
def from_serialized_file(file)
|
||||
|
||||
if file.index('mongo')
|
||||
options[:id] = file.scan( # Consolidate this
|
||||
/([0-9]+)\.mongo/).first.first
|
||||
from_db(:mongo, options)
|
||||
else
|
||||
unless File.readable?(file)
|
||||
raise Treat::Exception,
|
||||
"Path '#{file}' does not "+
|
||||
"point to a readable file."
|
||||
end
|
||||
doc = Treat::Entities::Document.new
|
||||
doc.set :file, file
|
||||
format = nil
|
||||
if file.index('yml') || file.index('yaml')
|
||||
format = :yaml
|
||||
elsif file.index('xml')
|
||||
f = :xml
|
||||
else
|
||||
raise Treat::Exception,
|
||||
"Unreadable serialized format for #{file}."
|
||||
end
|
||||
doc.unserialize(format, options)
|
||||
doc.children[0].set_as_root! # Fix this
|
||||
doc.children[0]
|
||||
unless File.readable?(file)
|
||||
raise Treat::Exception,
|
||||
"Path '#{file}' does not "+
|
||||
"point to a readable file."
|
||||
end
|
||||
doc = Treat::Entities::Document.new
|
||||
doc.set :file, file
|
||||
format = nil
|
||||
if File.extname(file) == '.yml' ||
|
||||
File.extname(file) == '.yaml'
|
||||
format = :yaml
|
||||
elsif File.extname(file) == '.xml'
|
||||
format = :xml
|
||||
else
|
||||
raise Treat::Exception,
|
||||
"Unreadable serialized format for #{file}."
|
||||
end
|
||||
doc.unserialize(format)
|
||||
doc.children[0].set_as_root! # Fix this
|
||||
doc.children[0]
|
||||
end
|
||||
|
||||
# Tell whether a path points to a serialized entity file.
#
# path_to_check - the path (String) to test.
#
# Returns true only when the extension of the path is listed in
# AcceptedFormats and the path is an existing regular file.
def is_serialized_file?(path_to_check)
  AcceptedFormats.include?(File.extname(path_to_check)) &&
    File.file?(path_to_check)
end
|
||||
|
||||
def from_db(hash)
|
||||
|
@ -258,9 +293,23 @@ module Treat::Entities::Entity::Buildable
|
|||
# Build any kind of entity from a string.
|
||||
def anything_from_string(string)
|
||||
case self.mn.downcase.intern
|
||||
when :document, :collection
|
||||
when :document
|
||||
folder = Treat.paths.files
|
||||
if folder[-1] == '/'
|
||||
folder = folder[0..-2]
|
||||
end
|
||||
|
||||
now = Time.now.to_f
|
||||
doc_file = folder+ "/#{now}.txt"
|
||||
string.force_encoding('UTF-8')
|
||||
File.open(doc_file, 'w') do |f|
|
||||
f.puts string
|
||||
end
|
||||
|
||||
from_raw_file(doc_file)
|
||||
when :collection
|
||||
raise Treat::Exception,
|
||||
"Cannot create a document or " +
|
||||
"Cannot create a " +
|
||||
"collection from a string " +
|
||||
"(need a readable file/folder)."
|
||||
when :phrase
|
||||
|
@ -287,6 +336,7 @@ module Treat::Entities::Entity::Buildable
|
|||
|
||||
end
|
||||
|
||||
# Return a UTF-8 version of the string.
#
# Only the undef: :replace option is passed to String#encode, so
# characters that have no UTF-8 equivalent are replaced; no
# invalid: option is given for malformed byte sequences.
#
# FIXME: this should be improved on.
def check_encoding(string)
  string.encode("UTF-8", undef: :replace)
end
|
||||
|
@ -346,7 +396,7 @@ module Treat::Entities::Entity::Buildable
|
|||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
def create_collection(fv)
|
||||
FileUtils.mkdir(fv)
|
||||
Treat::Entities::Collection.new(fv)
|
||||
|
|
|
@ -11,8 +11,8 @@ module Treat::Entities::Entity::Checkable
|
|||
return @features[feature] if has?(feature)
|
||||
return send(feature) if do_it
|
||||
task = caller_method(2) # This is dangerous !
|
||||
g1 = Treat::Workers::Category.lookup(task)
|
||||
g2 = Treat::Workers::Category.lookup(feature)
|
||||
g1 = Treat::Workers.lookup(task)
|
||||
g2 = Treat::Workers.lookup(feature)
|
||||
|
||||
raise Treat::Exception,
|
||||
"#{g1.type.to_s.capitalize} " +
|
||||
|
|
|
@ -41,6 +41,7 @@ module Treat::Entities::Entity::Countable
|
|||
# Returns the frequency of the given value
|
||||
# in the this entity.
|
||||
def frequency_of(value)
|
||||
value = value.downcase
|
||||
if is_a?(Treat::Entities::Token)
|
||||
raise Treat::Exception,
|
||||
"Cannot get the frequency " +
|
||||
|
|
|
@ -3,67 +3,64 @@
|
|||
# printed by the #print_debug function.
|
||||
module Treat::Entities::Entity::Debuggable
|
||||
|
||||
@@prev = nil
|
||||
@@i = 0
|
||||
# Previous state and counter.
|
||||
@@prev, @@i = nil, 0
|
||||
|
||||
# Explains what Treat is currently doing.
|
||||
# Fixme: last call will never get shown.
|
||||
def print_debug(entity, task, worker, group, options)
|
||||
|
||||
targs = group.targets.map do |target|
|
||||
target.to_s
|
||||
end
|
||||
# Get a list of the worker's targets.
|
||||
targets = group.targets.map(&:to_s)
|
||||
|
||||
if targs.size == 1
|
||||
t = targs[0]
|
||||
else
|
||||
t = targs[0..-2].join(', ') +
|
||||
' and/or ' + targs[-1]
|
||||
end
|
||||
# List the worker's targets as either
|
||||
# a single target or an and/or form
|
||||
# (since it would be too costly to
|
||||
# actually determine what target types
|
||||
# were processed at runtime for each call).
|
||||
t = targets.size == 1 ? targets[0] : targets[
|
||||
0..-2].join(', ') + ' and/or ' + targets[-1]
|
||||
|
||||
# Add genitive for annotations (sing./plural)
|
||||
genitive = targets.size > 1 ? 'their' : 'its'
|
||||
|
||||
# Set up an empty string and humanize task name.
|
||||
doing, human_task = '', task.to_s.gsub('_', ' ')
|
||||
|
||||
genitive = targs.size > 1 ?
|
||||
'their' : 'its'
|
||||
|
||||
doing = ''
|
||||
|
||||
human_task = task.to_s.gsub('_', ' ')
|
||||
|
||||
if group.type == :transformer ||
|
||||
group.type == :computer
|
||||
|
||||
# Base is "{task}-ed {a(n)|N} {target(s)}"
|
||||
if [:transformer, :computer].include?(group.type)
|
||||
tt = human_task
|
||||
tt = tt[0..-2] if tt[-1] == 'e'
|
||||
ed = tt[-1] == 'd' ? '' : 'ed'
|
||||
doing = "#{tt.capitalize}#{ed} #{t}"
|
||||
|
||||
# Base is "Annotated {a(n)|N} {target(s)}"
|
||||
elsif group.type == :annotator
|
||||
|
||||
if group.preset_option
|
||||
opt = options[group.preset_option]
|
||||
form = opt.to_s.gsub('_', ' ')
|
||||
human_task[-1] = ''
|
||||
human_task = form + ' ' + human_task
|
||||
end
|
||||
|
||||
doing = "Annotated #{t} with " +
|
||||
"#{genitive} #{human_task}"
|
||||
end
|
||||
|
||||
# Form is '{base} in format {worker}'.
|
||||
if group.to_s.index('Formatters')
|
||||
curr = doing +
|
||||
' in format ' +
|
||||
worker.to_s
|
||||
curr = doing + ' in format ' + worker.to_s
|
||||
# Form is '{base} using {worker}'.
|
||||
else
|
||||
curr = doing +
|
||||
' using ' +
|
||||
worker.to_s.gsub('_', ' ')
|
||||
curr = doing + ' using ' + worker.to_s.gsub('_', ' ')
|
||||
end
|
||||
|
||||
# Remove any double pluralization that may happen.
|
||||
curr.gsub!('ss', 's') unless curr.index('class')
|
||||
curr += '.'
|
||||
|
||||
if curr == @@prev
|
||||
@@i += 1
|
||||
else
|
||||
# Accumulate repeated tasks.
|
||||
@@i += 1 if curr == @@prev
|
||||
|
||||
# Change tasks, so output.
|
||||
if curr != @@prev && @@prev
|
||||
# Pluralize entity names if necessary.
|
||||
if @@i > 1
|
||||
Treat.core.entities.list.each do |e|
|
||||
@@prev.gsub!(e.to_s, e.to_s + 's')
|
||||
|
@ -71,9 +68,15 @@ module Treat::Entities::Entity::Debuggable
|
|||
@@prev.gsub!('its', 'their')
|
||||
@@prev = @@prev.split(' ').
|
||||
insert(1, @@i.to_s).join(' ')
|
||||
# Add determiner if singular.
|
||||
else
|
||||
@@prev = @@prev.split(' ').
|
||||
insert(1, 'a').join(' ')
|
||||
end
|
||||
# Reset counter.
|
||||
@@i = 0
|
||||
puts @@prev # Last call doesn't get shown.
|
||||
# Write to stdout.
|
||||
puts @@prev + '.'
|
||||
end
|
||||
|
||||
@@prev = curr
|
||||
|
|
|
@ -88,7 +88,6 @@ module Treat::Entities::Entity::Delegatable
|
|||
# Get the default worker for that language
|
||||
# inside the given group.
|
||||
def find_worker_for_language(language, group)
|
||||
|
||||
lang = Treat.languages[language]
|
||||
cat = group.to_s.split('::')[2].downcase.intern
|
||||
group = group.mn.ucc.intern
|
||||
|
@ -96,31 +95,25 @@ module Treat::Entities::Entity::Delegatable
|
|||
raise Treat::Exception,
|
||||
"No configuration file loaded for language #{language}."
|
||||
end
|
||||
|
||||
workers = lang.workers
|
||||
|
||||
if !workers.respond_to?(cat) ||
|
||||
!workers[cat].respond_to?(group)
|
||||
workers = Treat.languages.agnostic.workers
|
||||
end
|
||||
|
||||
if !workers.respond_to?(cat) ||
|
||||
!workers[cat].respond_to?(group)
|
||||
raise Treat::Exception,
|
||||
"No #{group} is/are available for the " +
|
||||
"#{language.to_s.capitalize} language."
|
||||
end
|
||||
|
||||
|
||||
workers[cat][group].first
|
||||
|
||||
end
|
||||
|
||||
# Return an error message and suggest possible typos.
|
||||
def worker_not_found(klass, group)
|
||||
"Algorithm '#{klass.mn.ucc}' couldn't be "+
|
||||
def worker_not_found(worker, group)
|
||||
"Worker with name '#{worker}' couldn't be "+
|
||||
"found in group #{group}." + Treat::Helpers::Help.
|
||||
did_you_mean?(group.list.map { |c| c.ucc }, klass.ucc)
|
||||
did_you_mean?(group.list.map { |c| c.ucc }, worker)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -105,18 +105,6 @@ module Treat::Entities::Entity::Iterable
|
|||
end
|
||||
i
|
||||
end
|
||||
|
||||
# Return the first element in the array, warning if not
|
||||
# the only one in the array. Used for magic methods: e.g.,
|
||||
# the magic method "word" if called on a sentence with many
|
||||
# words, Treat will return the first word, but warn the user.
|
||||
def first_but_warn(array, type)
|
||||
if array.size > 1
|
||||
warn "Warning: requested one #{type}, but" +
|
||||
" there are many #{type}s in this entity."
|
||||
end
|
||||
array[0]
|
||||
end
|
||||
|
||||
|
||||
end
|
|
@ -78,5 +78,16 @@ module Treat::Entities::Entity::Magical
|
|||
|
||||
end
|
||||
|
||||
# Return the first element of the array, warning when it is not
# the only one. Used for magic methods: e.g. when the magic method
# "word" is called on a sentence with many words, Treat returns
# the first word but warns the user.
#
# array - the candidates.
# type  - the requested entity type, interpolated into the warning.
#
# Returns array[0] (nil for an empty array).
def first_but_warn(array, type)
  if array.size > 1
    warn "Warning: requested one #{type}, but" +
    " there are many #{type}s in this entity."
  end
  array[0]
end
|
||||
|
||||
end
|
|
@ -6,6 +6,12 @@ module Treat::Entities::Entity::Stringable
|
|||
# Returns the entity's true string value.
|
||||
def to_string; @value.dup; end
|
||||
|
||||
# Returns an array of the children's string
|
||||
# values, found by calling #to_s on them.
|
||||
def to_a; @children.map { |c| c.to_s }; end
|
||||
|
||||
alias :to_ary :to_a
|
||||
|
||||
# Returns the entity's string value by
|
||||
# imploding the value of all terminal
|
||||
# entities in the subtree of that entity.
|
||||
|
@ -52,16 +58,14 @@ module Treat::Entities::Entity::Stringable
|
|||
end
|
||||
|
||||
# Helper method to implode the string value of the subtree.
|
||||
def implode
|
||||
def implode(value = "")
|
||||
|
||||
return @value.dup if !has_children?
|
||||
|
||||
value = ''
|
||||
|
||||
each do |child|
|
||||
|
||||
if child.is_a?(Treat::Entities::Section)
|
||||
value += "\n\n"
|
||||
value << "\n\n"
|
||||
end
|
||||
|
||||
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
||||
|
@ -69,14 +73,14 @@ module Treat::Entities::Entity::Stringable
|
|||
child.is_a?(Treat::Entities::Enclitic)
|
||||
value.strip!
|
||||
end
|
||||
value += child.to_s + ' '
|
||||
value << child.to_s + ' '
|
||||
else
|
||||
value += child.implode
|
||||
child.implode(value)
|
||||
end
|
||||
|
||||
if child.is_a?(Treat::Entities::Title) ||
|
||||
child.is_a?(Treat::Entities::Paragraph)
|
||||
value += "\n\n"
|
||||
value << "\n\n"
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -1,18 +1,29 @@
|
|||
# Helper methods to manipulate hashes.
|
||||
class Treat::Helpers::Hash
|
||||
|
||||
# Allow getting the caller method in any context.
|
||||
Hash.class_eval do
|
||||
# Mixin to allow conversion of hashes to
|
||||
# nested structs with the keys as attributes.
|
||||
module ToStruct
|
||||
# Converts a hash to nested structs.
|
||||
def self.hash_to_struct(hash)
|
||||
return hash if hash.keys.
|
||||
select { |k| !k.is_a?(Symbol) }.size > 0
|
||||
struct = Struct.new(*hash.keys).new(*hash.values)
|
||||
def to_struct
|
||||
hash = self
|
||||
symbols = hash.keys.select { |k|
|
||||
!k.is_a?(Symbol) }.size
|
||||
return hash if symbols > 0
|
||||
klass = Struct.new(*hash.keys)
|
||||
struct = klass.new(*hash.values)
|
||||
hash.each do |key, value|
|
||||
if value.is_a?(Hash)
|
||||
struct[key] = self.hash_to_struct(value)
|
||||
v = value.to_struct
|
||||
struct[key] = v
|
||||
end
|
||||
end; return struct
|
||||
end
|
||||
end
|
||||
|
||||
# Include the mixins on the core Hash class.
|
||||
Hash.class_eval do
|
||||
include Treat::Helpers::Hash::ToStruct
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -32,30 +32,4 @@ class Treat::Helpers::Help
|
|||
msg
|
||||
end
|
||||
|
||||
# Return the Levenshtein distance between two strings, taking into
# account the costs of insertion, deletion, and substitution.
# Used by did_you_mean? to detect typos.
#
# first, other - the strings to compare.
# ins          - cost charged for each extra character of first.
# del          - cost charged for each extra character of other.
# sub          - cost charged when two aligned characters differ.
#
# Returns the distance, or nil when either string is nil.
def self.levenshtein(first, other, ins=1, del=1, sub=1)
  return nil if first.nil? || other.nil?
  # dm[i][j] is the distance between the first i characters of
  # other and the first j characters of first.
  dm = [(0..first.length).map { |j| j * ins }]
  (1..other.length).each do |i|
    dm[i] = [i * del]
    (1..first.length).each do |j|
      # i walks over other and j over first, so the characters being
      # aligned are first[j-1] and other[i-1].
      cost = first[j-1] == other[i-1] ? 0 : sub
      dm[i][j] = [
        dm[i-1][j-1] + cost,
        dm[i][j-1] + ins,
        dm[i-1][j] + del
      ].min
    end
  end
  dm[other.length][first.length]
end
|
||||
|
||||
end
|
||||
|
|
|
@ -4,46 +4,40 @@ class Treat::Helpers::Object
|
|||
# Allow introspection onto what method called
|
||||
# another one at runtime (useful for debugging).
|
||||
module CallerMethod
|
||||
|
||||
# Pattern to match method from trace.
|
||||
CMPattern = /^(.+?):(\d+)(?::in `(.*)')?/
|
||||
# Return the name of the method that
|
||||
# called the method that calls this method.
|
||||
def caller_method(n = 3)
|
||||
at = caller(n).first
|
||||
/^(.+?):(\d+)(?::in `(.*)')?/ =~ at
|
||||
CMPattern =~ at
|
||||
Regexp.last_match[3].
|
||||
gsub('block in ', '').intern
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
# Retrieve the last name of a class/module
# (i.e. the part after the last "::").
module ModuleName

  # Returns the last "::"-separated segment of the receiver's
  # #to_s value (nil when #to_s is empty).
  def module_name
    to_s.split('::').last
  end

  # Short form of #module_name.
  alias :mn :module_name

end
|
||||
|
||||
module Verbosity
|
||||
# Runs a block of code without warnings.
|
||||
def silence_warnings(&block)
|
||||
warn_level = $VERBOSE
|
||||
$VERBOSE = nil
|
||||
result = block.call
|
||||
$VERBOSE = warn_level
|
||||
warn_level = $VERBOSE; $VERBOSE = nil
|
||||
result = block.call; $VERBOSE = warn_level
|
||||
result
|
||||
end
|
||||
|
||||
# Runs a block of code while blocking stdout.
|
||||
def silence_stdout(log = '/dev/null')
|
||||
unless Treat.core.verbosity.silence
|
||||
yield; return
|
||||
end
|
||||
old = $stdout.dup
|
||||
$stdout.reopen(File.new(log, 'w'))
|
||||
yield
|
||||
$stdout = old
|
||||
file, old, ret = File.new(log, 'w'),
|
||||
$stdout.dup, nil; $stdout.reopen(file)
|
||||
ret = yield; $stdout = old; return ret
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ class Treat::Helpers::String
|
|||
if @@cc_cache[o_phrase]
|
||||
return @@cc_cache[o_phrase]
|
||||
end
|
||||
if Treat.core.acronyms.include?(phrase)
|
||||
if Treat.core.acronyms.include?(phrase.downcase)
|
||||
phrase = phrase.upcase
|
||||
else
|
||||
phrase.gsub!(Regex) { |a| a.upcase }
|
||||
|
@ -99,12 +99,19 @@ class Treat::Helpers::String
|
|||
|
||||
end
|
||||
|
||||
# Determines whether a name looks like an "-able" mixin:
# the check is purely textual, on the receiver's #to_s value.
module IsMixin

  # Returns true when #to_s ends with 'able', false otherwise.
  def is_mixin?
    to_s.end_with?('able')
  end

end
|
||||
|
||||
# Graft the helpers onto the string module.
|
||||
String.class_eval do
|
||||
include Treat::Helpers::String::CamelCaseable
|
||||
include Treat::Helpers::String::UnCamelCaseable
|
||||
include Treat::Helpers::String::Escapable
|
||||
include Treat::Helpers::String::Unescapable
|
||||
include Treat::Helpers::String::IsMixin
|
||||
end
|
||||
|
||||
# Graft camel casing onto symbols.
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
# Handles the verbosity for external
|
||||
# programs (gems, binaries, etc.)
|
||||
module Treat::Helpers::Verbosity
|
||||
|
||||
|
||||
|
||||
end
|
|
@ -63,7 +63,7 @@ class Treat::Learning::Problem
|
|||
# all of the features.
|
||||
def export_features(e, include_answer = true)
|
||||
features = export(e, @features)
|
||||
return features unless include_answer
|
||||
return features if !include_answer
|
||||
features << (e.has?(@question.name) ?
|
||||
e.get(@question.name) : @question.default)
|
||||
features
|
||||
|
@ -80,9 +80,11 @@ class Treat::Learning::Problem
|
|||
|
||||
def export(entity, exports)
|
||||
unless @question.target == entity.type
|
||||
targ, type = @question.target, entity.type
|
||||
raise Treat::Exception,
|
||||
"This classification problem targets #{@question.target}s, " +
|
||||
"but a(n) #{entity.type} was passed to export instead."
|
||||
"This classification problem targets " +
|
||||
"#{targ}s, but a(n) #{type} " +
|
||||
"was passed to export instead."
|
||||
end
|
||||
ret = []
|
||||
exports.each do |export|
|
||||
|
@ -116,9 +118,8 @@ class Treat::Learning::Problem
|
|||
question = Treat::Learning::Question.new(
|
||||
hash['question']['name'],
|
||||
hash['question']['target'],
|
||||
hash['question']['type'],
|
||||
hash['question']['default'],
|
||||
hash['question']['labels']
|
||||
hash['question']['type']
|
||||
)
|
||||
features = []
|
||||
hash['features'].each do |feature|
|
||||
|
|
|
@ -16,12 +16,9 @@ class Treat::Learning::Question
|
|||
attr_reader :type
|
||||
# Default for the answer to the question.
|
||||
attr_reader :default
|
||||
# A list of possible answers to the question.
|
||||
attr_reader :labels
|
||||
|
||||
# Initialize the question.
|
||||
def initialize(name, target,
|
||||
type = :continuous, default = nil, labels = [])
|
||||
def initialize(name, target, default = nil, type = :continuous)
|
||||
unless name.is_a?(Symbol)
|
||||
raise Treat::Exception,
|
||||
"Question name should be a symbol."
|
||||
|
@ -35,8 +32,8 @@ class Treat::Learning::Question
|
|||
raise Treat::Exception, "Type should be " +
|
||||
"continuous or discrete."
|
||||
end
|
||||
@name, @target, @type, @default, @labels =
|
||||
name, target, type, default, labels
|
||||
@name, @target, @type, @default =
|
||||
name, target, type, default
|
||||
end
|
||||
|
||||
# Custom comparison operator for questions.
|
||||
|
@ -44,8 +41,7 @@ class Treat::Learning::Question
|
|||
@name == question.name &&
|
||||
@type == question.type &&
|
||||
@target == question.target &&
|
||||
@default == question.default &&
|
||||
@labels = question.labels
|
||||
@default == question.default
|
||||
end
|
||||
|
||||
end
|
|
@ -0,0 +1,52 @@
|
|||
# Shared loader for external Java libraries that are driven through
# a wrapper class. The wrapper passed to .load must respond to
# jar_path=, model_path=, use, log_file= and bind.
class Treat::Loaders::BindIt

  # Keep track of which wrapper classes have already been loaded.
  @@loaded = {}

  # Configure and bind a wrapped library for a given language.
  #
  # klass    - the wrapper class to configure and bind.
  # name     - key of the library in Treat.libraries; also used as
  #            the default sub-folder name under Treat.paths.bin
  #            and Treat.paths.models.
  # language - the language to use; defaults to
  #            Treat.core.language.default.
  #
  # Does nothing when klass has already been loaded.
  # Raises Treat::Exception when the JAR folder or the model folder
  # is not a directory.
  def self.load(klass, name, language = nil)

    return if @@loaded[klass]

    language ||= Treat.core.language.default

    # Explicit configuration wins; otherwise fall back to the
    # conventional folders for this library.
    jar_path = Treat.libraries[name].jar_path ||
      Treat.paths.bin + "#{name}/"
    model_path = Treat.libraries[name].model_path ||
      Treat.paths.models + "#{name}/"

    unless File.directory?(jar_path)
      raise Treat::Exception, "Looking for #{klass} " +
        "library JAR files in #{jar_path}, but it is " +
        "not a directory. Please set the config option " +
        "Treat.libraries.#{name}.jar_path to a folder " +
        "containing the appropriate JAR files."
    end

    unless File.directory?(model_path)
      raise Treat::Exception, "Looking for #{klass} " +
        "library model files in #{model_path}, but it " +
        "is not a directory. Please set the config option " +
        "Treat.libraries.#{name}.model_path to a folder " +
        "containing the appropriate model files."
    end

    klass.jar_path = jar_path
    klass.model_path = model_path
    klass.use language

    # In silent mode, send the library's log to the platform's
    # null device.
    if Treat.core.verbosity.silence
      klass.log_file = Gem.win_platform? ? 'NUL' : '/dev/null'
    end

    klass.bind

    @@loaded[klass] = true

  end

end
|
|
@ -10,14 +10,13 @@ class Treat::Loaders::Linguistics
|
|||
# to the supplied language; raises an exception
|
||||
# if there is no such language class registered.
|
||||
def self.load(language)
|
||||
silence_warnings do
|
||||
# Linguistics throws warnings; silence them.
|
||||
silence_warnings { require 'linguistics' }
|
||||
code = language.to_s[0..1].upcase
|
||||
@@languages[language] ||=
|
||||
::Linguistics.const_get(code)
|
||||
code = language.to_s[0..1].intern # FIX
|
||||
unless @@languages[language]
|
||||
require 'linguistics'
|
||||
Linguistics.use(code)
|
||||
@@languages[language] = true
|
||||
end
|
||||
return @@languages[language]
|
||||
code
|
||||
rescue RuntimeError
|
||||
raise Treat::Exception,
|
||||
"Ruby Linguistics does not have a module " +
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
require_relative 'bind_it'
|
||||
|
||||
# A helper class to load the OpenNLP package.
class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt

  # Require the 'open-nlp' library and hand its OpenNLP constant
  # to the parent loader under the :open_nlp library name.
  #
  # language - the language to load for; nil is passed through to
  #            the parent loader unchanged.
  def self.load(language = nil)
    require 'open-nlp'
    super(OpenNLP, :open_nlp, language)
  end

end
|
|
@ -1,24 +1,20 @@
|
|||
# A helper class to load the CoreNLP package.
|
||||
class Treat::Loaders::Stanford
|
||||
|
||||
# Keep track of whether its loaded or not.
|
||||
@@loaded = false
|
||||
require_relative 'bind_it'
|
||||
|
||||
# Load CoreNLP package for a given language.
|
||||
def self.load(language = nil)
|
||||
return if @@loaded
|
||||
require 'stanford-core-nlp'
|
||||
language ||= Treat.core.language.default
|
||||
StanfordCoreNLP.jar_path =
|
||||
Treat.libraries.stanford.jar_path ||
|
||||
Treat.paths.bin + 'stanford/'
|
||||
StanfordCoreNLP.model_path =
|
||||
Treat.libraries.stanford.model_path ||
|
||||
Treat.paths.models + 'stanford/'
|
||||
StanfordCoreNLP.use(language)
|
||||
StanfordCoreNLP.log_file = '/dev/null' if
|
||||
Treat.core.verbosity.silence
|
||||
StanfordCoreNLP.bind; @@loaded = true
|
||||
end
|
||||
# A helper class to load the CoreNLP package.
|
||||
class Treat::Loaders::Stanford < Treat::Loaders::BindIt
|
||||
|
||||
end
|
||||
def self.load(language = nil)
|
||||
require 'stanford-core-nlp'
|
||||
super(StanfordCoreNLP, :stanford, language)
|
||||
end
|
||||
|
||||
# Build the path to a Stanford model file.
#
# name     - key into StanfordCoreNLP::Config::Models and
#            StanfordCoreNLP::Config::ModelFolders.
# language - the language; converted with #intern before lookup.
#
# Returns the model folder and model file joined onto
# Treat.libraries.stanford.model_path or, when that is not set,
# onto the 'stanford' folder under Treat.paths.models.
def self.find_model(name, language)
  language = language.intern
  model_file = StanfordCoreNLP::Config::Models[name][language]
  model_dir = StanfordCoreNLP::Config::ModelFolders[name]
  model_path = Treat.libraries.stanford.model_path ||
    File.join(Treat.paths.models, 'stanford')
  File.join(model_path, model_dir, model_file)
end
|
||||
|
||||
end
|
|
@ -1,13 +1,13 @@
|
|||
module Treat
|
||||
|
||||
# Contains common utility/helper functions.
|
||||
module Helpers; include Autoload; end
|
||||
|
||||
# Contains all the configuration options.
|
||||
module Config; include Autoload; end
|
||||
|
||||
# Load all the configuration options.
|
||||
Treat::Config.configure!
|
||||
|
||||
# Contains common utility/helper functions.
|
||||
module Helpers; include Autoload; end
|
||||
# Import all the configuration options.
|
||||
Treat::Config.import!
|
||||
|
||||
# Contains classes to load external libraries.
|
||||
module Loaders; include Autoload; end
|
||||
|
@ -20,7 +20,10 @@ module Treat
|
|||
|
||||
# Contains all the worker categories.
|
||||
module Workers; include Autoload; end
|
||||
|
||||
|
||||
# Make all the worker categories.
|
||||
Treat::Workers.categorize!
|
||||
|
||||
# Installs builders on core Ruby objects.
|
||||
module Proxies; include Autoload; end
|
||||
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
module Treat::Proxies

  # Proxy that lets arrays answer Treat worker calls by applying
  # them to each of their elements.
  module Array

    # Include base proxy functionality.
    include Treat::Proxies::Proxy

    # When sym is :do, :apply, or a method that
    # Treat::Workers.lookup recognizes, send it with args to every
    # element (elements that are not entities are converted with
    # #to_entity first) and return the array of results.
    # Any other call is passed on to super. Note that the block is
    # only forwarded on the super path.
    def method_missing(sym, *args, &block)
      if [:do, :apply].include?(sym) ||
        Treat::Workers.lookup(sym)
        map do |el|
          target = el.is_a?(Treat::Entities::Entity) ? el : el.to_entity
          target.send(sym, *args)
        end
      else
        super(sym, *args, &block)
      end
    end

  end

  # Include Treat methods on arrays.
  ::Array.class_eval do
    include Treat::Proxies::Array
  end

end
|
|
@ -21,17 +21,18 @@ module Treat::Proxies
|
|||
!Treat.core.language.detect
|
||||
|
||||
if is_a?(Treat::Entities::Symbol) ||
|
||||
is_a?(Treat::Entities::Number)
|
||||
is_a?(Treat::Entities::Number) ||
|
||||
is_a?(Treat::Entities::Punctuation)
|
||||
return Treat.core.language.default
|
||||
end
|
||||
|
||||
dlvl = Treat.core.language.detect_at
|
||||
dklass = Treat::Entities.const_get(dlvl.cc)
|
||||
|
||||
if self.class.compare_with(
|
||||
dklass) < 1 && has_parent?
|
||||
if self.class.compare_with(dklass) < 1
|
||||
anc = ancestor_with_type(dlvl)
|
||||
return anc.language if anc
|
||||
return self.parent.language if has_parent?
|
||||
end
|
||||
|
||||
extractor ||= Treat.workers.
|
||||
|
|
|
@ -10,15 +10,16 @@ module Treat::Proxies
|
|||
# object and send the method call to the entity.
|
||||
def method_missing(sym, *args, &block)
|
||||
if [:do, :apply].include?(sym) ||
|
||||
Treat::Workers::Category.lookup(sym)
|
||||
to_entity.send(sym, *args)
|
||||
Treat::Workers.lookup(sym)
|
||||
to_entity.send(sym, *args)
|
||||
else
|
||||
super(sym, *args, &block)
|
||||
end
|
||||
end
|
||||
|
||||
# Create an unknown type of entity by default.
|
||||
def to_entity(builder = nil)
|
||||
Treat::Entities::Unknown(self.to_s)
|
||||
Treat::Entities::Unknown.new(self.to_s)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
module Treat
|
||||
|
||||
|
||||
# The current version of Treat.
|
||||
VERSION = "1.2.0"
|
||||
|
||||
VERSION = '2.1.0'
|
||||
|
||||
# Treat requires Ruby >= 1.9.2
|
||||
if RUBY_VERSION < '1.9.2'
|
||||
raise "Treat requires Ruby version 1.9.2 " +
|
||||
"or higher, but current is #{RUBY_VERSION}."
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,51 +1,49 @@
|
|||
# This module creates all the worker categories
|
||||
# and the groups within these categories and adds
|
||||
# the relevant hooks on the appropriate entities.
|
||||
module Treat::Workers::Category
|
||||
module Treat::Workers::Categorizable
|
||||
|
||||
require_relative 'group'
|
||||
require_relative 'groupable'
|
||||
|
||||
# A lookup table for entity types.
|
||||
@@lookup = {}
|
||||
|
||||
# Find a worker group based on method.
|
||||
def self.lookup(method)
|
||||
@@lookup[method]
|
||||
end
|
||||
def lookup(method); @@lookup[method]; end
|
||||
|
||||
def self.create_categories
|
||||
def categorize!
|
||||
Treat.workers.members.each do |cat|
|
||||
create_category(cat.
|
||||
capitalize.intern,
|
||||
load_category_conf(cat))
|
||||
name = cat.capitalize.intern
|
||||
conf = load_category_conf(cat)
|
||||
create_category(name, conf)
|
||||
end
|
||||
end
|
||||
|
||||
def self.load_category_conf(name)
|
||||
config = Treat.workers[name]
|
||||
if config.nil?
|
||||
def load_category_conf(name)
|
||||
if !Treat.workers.respond_to?(name)
|
||||
raise Treat::Exception,
|
||||
"The configuration file " +
|
||||
"for #{cat_sym} is missing."
|
||||
else
|
||||
Treat.workers[name]
|
||||
end
|
||||
config
|
||||
end
|
||||
|
||||
def self.create_category(name, conf)
|
||||
def create_category(name, conf)
|
||||
category = Treat::Workers.
|
||||
const_set(name, Module.new)
|
||||
conf.each_pair do |group, worker|
|
||||
name = group.to_s.cc.intern
|
||||
category.module_eval do
|
||||
@@methods = []; def methods;
|
||||
@@methods; end; def groups;
|
||||
self.constants; end
|
||||
@@methods = []
|
||||
def methods; @@methods; end
|
||||
def groups; self.constants; end
|
||||
end
|
||||
self.create_group(name, worker, category)
|
||||
create_group(name, worker, category)
|
||||
end
|
||||
end
|
||||
|
||||
def self.create_group(name, conf, category)
|
||||
def create_group(name, conf, category)
|
||||
group = category.const_set(name, Module.new)
|
||||
self.set_group_options(group, conf)
|
||||
self.bind_group_targets(group)
|
||||
|
@ -54,27 +52,9 @@ module Treat::Workers::Category
|
|||
@@lookup[group.method] = group
|
||||
end
|
||||
|
||||
def self.bind_group_targets(group)
|
||||
group.targets.each do |entity_type|
|
||||
entity = Treat::Entities.
|
||||
const_get(entity_type.cc)
|
||||
entity.class_eval do
|
||||
add_workers group
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def self.register_group_presets(group, conf)
|
||||
return unless conf.respond_to? :presets
|
||||
conf.presets.each do |m|
|
||||
@@methods << m
|
||||
@@lookup[m] = group
|
||||
end
|
||||
end
|
||||
|
||||
def self.set_group_options(group, conf)
|
||||
def set_group_options(group, conf)
|
||||
group.module_eval do
|
||||
extend Treat::Workers::Group
|
||||
extend Treat::Workers::Groupable
|
||||
self.type = conf.type
|
||||
self.targets = conf.targets
|
||||
if conf.respond_to?(:default)
|
||||
|
@ -92,6 +72,22 @@ module Treat::Workers::Category
|
|||
end
|
||||
end
|
||||
|
||||
self.create_categories
|
||||
def bind_group_targets(group)
|
||||
group.targets.each do |entity_type|
|
||||
entity = Treat::Entities.
|
||||
const_get(entity_type.cc)
|
||||
entity.class_eval do
|
||||
add_workers group
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def register_group_presets(group, conf)
|
||||
return unless conf.respond_to?(:presets)
|
||||
conf.presets.each do |method|
|
||||
@@methods << method
|
||||
@@lookup[method] = group
|
||||
end
|
||||
end
|
||||
|
||||
end
|
|
@ -0,0 +1,35 @@
|
|||
# The C extension uses char* strings, and so Unicode strings
|
||||
# will give incorrect distances. Need to provide a pure
|
||||
# implementation if that's the case (FIX).
|
||||
class Treat::Workers::Extractors::Distance::Levenshtein
|
||||
|
||||
require 'levenshtein'
|
||||
|
||||
DefaultOptions = {
|
||||
ins_cost: 1,
|
||||
del_cost: 1,
|
||||
sub_cost: 1
|
||||
}
|
||||
|
||||
@@matcher = nil
|
||||
|
||||
# Return the Levenshtein distance between
|
||||
# two strings taking into account the costs
|
||||
# of insertion, deletion, and substitution.
|
||||
def self.distance(entity, options)
|
||||
|
||||
options = DefaultOptions.merge(options)
|
||||
|
||||
unless options[:to]
|
||||
raise Treat::Exception, "Must supply " +
|
||||
"a string/entity to compare to using " +
|
||||
"the option :to for this worker."
|
||||
end
|
||||
|
||||
a, b = entity.to_s, options[:to].to_s
|
||||
|
||||
Levenshtein.distance(a, b)
|
||||
|
||||
end
|
||||
|
||||
end
|
|
@ -23,19 +23,16 @@ class Treat::Workers::Extractors::Keywords::TfIdf
|
|||
|
||||
tf_idfs = tf_idfs.
|
||||
sort_by {|k,v| v}.reverse
|
||||
|
||||
if tf_idfs.size <= options[:number]
|
||||
return tf_idfs
|
||||
end
|
||||
|
||||
|
||||
keywords = []
|
||||
i = 0
|
||||
max_count = tf_idfs.size < options[:number] ? tf_idfs.size : options[:number]
|
||||
|
||||
tf_idfs.each do |word|
|
||||
|
||||
w = word[0].to_s
|
||||
next if keywords.include?(w)
|
||||
break if i > options[:number]
|
||||
break if i > max_count
|
||||
keywords << w
|
||||
|
||||
i += 1
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
# Language detection using a probabilistic algorithm
|
||||
# that checks for the presence of words with Bloom
|
||||
# that checks for the presence of words with Bloom
|
||||
# filters built from dictionaries for each language.
|
||||
#
|
||||
# Original paper: Grothoff. 2007. A Quick Introduction to
|
||||
# Bloom Filters. Department of Computer Sciences, Purdue
|
||||
# Original paper: Grothoff. 2007. A Quick Introduction to
|
||||
# Bloom Filters. Department of Computer Sciences, Purdue
|
||||
# University.
|
||||
class Treat::Workers::Extractors::Language::WhatLanguage
|
||||
|
||||
|
@ -35,7 +35,7 @@ class Treat::Workers::Extractors::Language::WhatLanguage
|
|||
|
||||
options = DefaultOptions.merge(options)
|
||||
|
||||
@@detector ||= ::WhatLanguage.new(:possibilities)
|
||||
@@detector ||= ::WhatLanguage.new(:all)
|
||||
possibilities = @@detector.process_text(entity.to_s)
|
||||
lang = {}
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Named entity tag extraction using the Stanford NLP
|
||||
# Deterministic Coreference Resolver, which implements a
|
||||
# multi-pass sieve coreference resolution (or anaphora
|
||||
# resolution) system.
|
||||
# resolution) system based on conditional random fields.
|
||||
#
|
||||
# Original paper: Heeyoung Lee, Yves Peirsman, Angel
|
||||
# Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
|
||||
|
@ -16,32 +16,24 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
|||
|
||||
def self.name_tag(entity, options = {})
|
||||
|
||||
pp = nil
|
||||
|
||||
language = entity.language
|
||||
|
||||
Treat::Loaders::Stanford.load(language)
|
||||
|
||||
isolated_token = entity.is_a?(Treat::Entities::Token)
|
||||
tokens = isolated_token ? [entity] : entity.tokens
|
||||
|
||||
ms = StanfordCoreNLP::Config::Models[:ner][language]
|
||||
model_path = Treat.libraries.stanford.model_path ||
|
||||
(Treat.paths.models + '/stanford/')
|
||||
ms = model_path + '/' +
|
||||
StanfordCoreNLP::Config::ModelFolders[:ner] +
|
||||
ms['3class']
|
||||
|
||||
@@classifiers[language] ||=
|
||||
StanfordCoreNLP::CRFClassifier.
|
||||
getClassifier(ms)
|
||||
|
||||
|
||||
unless classifier = @@classifiers[language]
|
||||
model = Treat::Loaders::Stanford.find_model(:ner, language)
|
||||
unless StanfordCoreNLP.const_defined?('CRFClassifier')
|
||||
StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
|
||||
end
|
||||
classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
|
||||
@@classifiers[language] = classifier
|
||||
end
|
||||
|
||||
token_list = StanfordCoreNLP.get_list(tokens)
|
||||
sentence = @@classifiers[language].
|
||||
classify_sentence(token_list)
|
||||
|
||||
sentence = classifier.classify_sentence(token_list)
|
||||
i = 0
|
||||
n = 0
|
||||
|
||||
sentence.each do |s_token|
|
||||
tag = s_token.get(:answer).to_s.downcase
|
||||
|
@ -49,14 +41,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
|||
return tag if isolated_token
|
||||
if tag
|
||||
tokens[i].set :name_tag, tag
|
||||
n += 1
|
||||
end
|
||||
i += 1
|
||||
end
|
||||
|
||||
entity.set :named_entity_count, n
|
||||
|
||||
nil
|
||||
|
||||
end
|
||||
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
# Similarity measure for short strings such as person names.
|
||||
# C extension won't work for Unicode strings; need to set
|
||||
# extension to "pure" in that case (FIX).
|
||||
class Treat::Workers::Extractors::Similarity::JaroWinkler
|
||||
|
||||
require 'fuzzystringmatch'
|
||||
|
||||
DefaultOptions = {
|
||||
threshold: 0.7,
|
||||
implementation: nil
|
||||
}
|
||||
|
||||
@@matcher = nil
|
||||
|
||||
def self.similarity(entity, options={})
|
||||
|
||||
options = DefaultOptions.merge(options)
|
||||
|
||||
unless options[:to]
|
||||
raise Treat::Exception, "Must supply " +
|
||||
"a string/entity to compare to using " +
|
||||
"the option :to for this worker."
|
||||
end
|
||||
|
||||
unless @@matcher
|
||||
impl = options[:implementation]
|
||||
impl ||= defined?(JRUBY_VERSION) ? :pure : :native
|
||||
klass = FuzzyStringMatch::JaroWinkler
|
||||
@@matcher = klass.create(impl)
|
||||
end
|
||||
|
||||
a, b = entity.to_s, options[:to].to_s
|
||||
|
||||
@@matcher.getDistance(a, b)
|
||||
|
||||
end
|
||||
|
||||
end
|
|
@ -0,0 +1,43 @@
|
|||
# Calculates the TF*IDF similarity between documents.
|
||||
class Treat::Workers::Extractors::Similarity::TfIdf
|
||||
|
||||
require 'tf-idf-similarity'
|
||||
|
||||
def self.similarity(entity, options={})
|
||||
|
||||
raise 'Not currently implemented.'
|
||||
|
||||
unless options[:to] &&
|
||||
options[:to].type == :document
|
||||
raise Treat::Exception, 'Must supply ' +
|
||||
'a document to compare to using ' +
|
||||
'the option :to for this worker.'
|
||||
end
|
||||
|
||||
unless options[:to].parent_collection &&
|
||||
entity.parent_collection
|
||||
raise Treat::Exception, 'The TF*IDF ' +
|
||||
'similarity algorithm can only be applied ' +
|
||||
'to documents that are inside collections.'
|
||||
end
|
||||
|
||||
coll = TfIdfSimilarity::Collection.new
|
||||
|
||||
entity.each_document do |doc|
|
||||
tdoc = TfIdfSimilarity::Document.new(doc.to_s)
|
||||
term_counts = Hash.new(0)
|
||||
doc.each_word do |word|
|
||||
val = word.value.downcase
|
||||
term_counts[val] ||= 0.0
|
||||
term_counts[val] += 1.0
|
||||
end
|
||||
size = term_counts.values.reduce(:+)
|
||||
tdoc.instance_eval do
|
||||
@term_counts, @size = term_counts, size
|
||||
end
|
||||
coll << tdoc
|
||||
end
|
||||
puts coll.similarity_matrix.inspect
|
||||
end
|
||||
|
||||
end
|
|
@ -0,0 +1,20 @@
|
|||
# Time/date extraction using a simple rule-based library.
|
||||
#
|
||||
# Supported formats: Today, yesterday, tomorrow,
|
||||
# last thursday, this thursday, 14 Sep, 14 June 2010.
|
||||
# Any dates without a year are assumed to be in the past.
|
||||
class Treat::Workers::Extractors::Time::Kronic
|
||||
|
||||
require 'kronic'
|
||||
require 'date'
|
||||
|
||||
# Return the date information contained within
|
||||
# the entity by parsing it with the 'kronic' gem.
|
||||
#
|
||||
# Options: none.
|
||||
def self.time(entity, options = {})
|
||||
time = Kronic.parse(entity.to_s)
|
||||
time.is_a?(DateTime) ? time : nil
|
||||
end
|
||||
|
||||
end
|
|
@ -53,9 +53,9 @@ class Treat::Workers::Extractors::TopicWords::LDA
|
|||
# Run the EM algorithm using random
|
||||
# starting points
|
||||
|
||||
silence_stdout do
|
||||
lda.em('random')
|
||||
end
|
||||
Treat.core.verbosity.silence ?
|
||||
silence_stdout { lda.em('random') } :
|
||||
lda.em('random')
|
||||
|
||||
# Load the vocabulary.
|
||||
if options[:vocabulary]
|
||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
|
|||
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
|
||||
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
||||
DefaultOptions = {
|
||||
:default_to => 'txt'
|
||||
:default_to => 'document'
|
||||
}
|
||||
|
||||
# Choose a reader to use.
|
||||
|
@ -12,7 +12,9 @@ class Treat::Workers::Formatters::Readers::Autoselect
|
|||
# - (Symbol) :default_to => format to default to.
|
||||
def self.read(document, options = {})
|
||||
options = DefaultOptions.merge(options)
|
||||
document.read(detect_format(document.file, options[:default_to]))
|
||||
fmt = detect_format(document.file, options[:default_to])
|
||||
Treat::Workers::Formatters::Readers.
|
||||
const_get(fmt.cc).read(document,options)
|
||||
end
|
||||
|
||||
def self.detect_format(filename, default_to = nil)
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
require 'yomu'
|
||||
|
||||
# This class is a wrapper for Yomu.
|
||||
# Yomu is a library for extracting text and metadata from files and documents
|
||||
# using the Apache Tika content analysis toolkit.
|
||||
class Treat::Workers::Formatters::Readers::Document
|
||||
# Extract the readable text from any document.
|
||||
#
|
||||
# Options: none.
|
||||
def self.read(document, options = {})
|
||||
yomu = Yomu.new(document.file)
|
||||
|
||||
document.value = yomu.text
|
||||
document.set :format, yomu.mimetype.extensions.first
|
||||
document
|
||||
end
|
||||
end
|
|
@ -11,7 +11,8 @@ class Treat::Workers::Formatters::Readers::HTML
|
|||
# By default, don't backup the original HTML
|
||||
DefaultOptions = {
|
||||
:keep_html => false,
|
||||
:tags => %w[p div h1 h2 h3 ul ol dl dt li]
|
||||
:tags => %w[p div h1 h2 h3 ul ol dl dt li img],
|
||||
|
||||
}
|
||||
|
||||
# Read the HTML document and strip it of its markup.
|
||||
|
@ -46,6 +47,7 @@ class Treat::Workers::Formatters::Readers::HTML
|
|||
d = Readability::Document.new(html, options)
|
||||
document.value = "<h1>#{d.title}</h1>\n" + d.content
|
||||
document.set :format, 'html'
|
||||
document.set :images, d.images
|
||||
end
|
||||
|
||||
document
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
# statistical natural language modeling, and multi-
|
||||
# lingual capabilities."
|
||||
#
|
||||
# Original paper: Google Ocropus Engine: Breuel,
|
||||
# Thomas M. The Ocropus Open Source OCR System.
|
||||
# Original paper: Google Ocropus Engine: Breuel,
|
||||
# Thomas M. The Ocropus Open Source OCR System.
|
||||
# DFKI and U. Kaiserslautern, Germany.
|
||||
class Treat::Workers::Formatters::Readers::Image
|
||||
|
||||
|
@ -18,29 +18,31 @@ class Treat::Workers::Formatters::Readers::Image
|
|||
#
|
||||
# - (Boolean) :silent => whether to silence Ocropus.
|
||||
def self.read(document, options = {})
|
||||
|
||||
|
||||
read = lambda do |doc|
|
||||
self.create_temp_dir do |tmp|
|
||||
`ocropus book2pages #{tmp}/out #{doc.file}`
|
||||
`ocropus pages2lines #{tmp}/out`
|
||||
`ocropus lines2fsts #{tmp}/out`
|
||||
`ocropus buildhtml #{tmp}/out > #{tmp}/output.html`
|
||||
doc.set :file, "#{tmp}/output.html"
|
||||
`ocropus-nlbin -o #{tmp}/out #{doc.file}`
|
||||
`ocropus-gpageseg #{tmp}/out/????.bin.png --minscale 2`
|
||||
`ocropus-rpred #{tmp}/out/????/??????.bin.png`
|
||||
`ocropus-hocr #{tmp}/out/????.bin.png -o #{tmp}/book.html`
|
||||
doc.set :file, "#{tmp}/book.html"
|
||||
doc.set :format, :html
|
||||
|
||||
doc = doc.read(:html)
|
||||
doc.set :file, f
|
||||
doc.set :format, 'image'
|
||||
end
|
||||
end
|
||||
|
||||
options[:silent] ? silence_stdout {
|
||||
|
||||
Treat.core.verbosity.silence ? silence_stdout {
|
||||
read.call(document) } : read.call(document)
|
||||
|
||||
|
||||
document
|
||||
|
||||
end
|
||||
|
||||
# Create a dire that gets deleted after execution of the block.
|
||||
|
||||
# Create a dir that gets deleted after execution of the block.
|
||||
def self.create_temp_dir(&block)
|
||||
if not FileTest.directory?(Treat.paths.tmp)
|
||||
FileUtils.mkdir(Treat.paths.tmp)
|
||||
end
|
||||
dname = Treat.paths.tmp +
|
||||
"#{Random.rand(10000000).to_s}"
|
||||
Dir.mkdir(dname)
|
||||
|
@ -48,5 +50,5 @@ class Treat::Workers::Formatters::Readers::Image
|
|||
ensure
|
||||
FileUtils.rm_rf(dname)
|
||||
end
|
||||
|
||||
|
||||
end
|
|
@ -32,6 +32,9 @@ class Treat::Workers::Formatters::Readers::PDF
|
|||
# Create a temporary file which is deleted
|
||||
# after execution of the block.
|
||||
def self.create_temp_file(ext, value = nil, &block)
|
||||
if not FileTest.directory?(Treat.paths.tmp)
|
||||
FileUtils.mkdir(Treat.paths.tmp)
|
||||
end
|
||||
fname = Treat.paths.tmp +
|
||||
"#{Random.rand(10000000).to_s}.#{ext}"
|
||||
File.open(fname, 'w') do |f|
|
||||
|
|
|
@ -30,7 +30,7 @@ class Treat::Workers::Formatters::Readers::XML
|
|||
@@xml_reader ||= StanfordCoreNLP.load(
|
||||
:tokenize, :ssplit, :cleanxml)
|
||||
|
||||
text = StanfordCoreNLP::Text.new(xml)
|
||||
text = StanfordCoreNLP::Annotation.new(xml)
|
||||
@@xml_reader.annotate(text)
|
||||
|
||||
text.get(:sentences).each do |sentence|
|
||||
|
|
|
@ -9,18 +9,19 @@ class Treat::Workers::Formatters::Serializers::XML
|
|||
# - (String) :file => a file to write to.
|
||||
def self.serialize(entity, options = {})
|
||||
options[:file] ||= (entity.id.to_s + '.xml')
|
||||
if options[:indent].nil?
|
||||
options = options.merge({:indent => 0})
|
||||
end
|
||||
indent = options[:indent]
|
||||
if options[:indent] == 0
|
||||
enc = entity.to_s.encoding.to_s.downcase
|
||||
string = "<?xml version=\"1.0\" " +
|
||||
"encoding=\"#{enc}\" ?>\n<treat>\n"
|
||||
else
|
||||
string = ''
|
||||
end
|
||||
spaces = ''
|
||||
options[:indent] = 0
|
||||
enc = entity.to_s.encoding.to_s.downcase
|
||||
string = "<?xml version=\"1.0\" " +
|
||||
"encoding=\"#{enc}\" ?>\n<treat>\n"
|
||||
val = self.recurse(entity, options)
|
||||
string += "#{val}\n</treat>"
|
||||
File.open(options[:file], 'w') do |f|
|
||||
f.write(string)
|
||||
end; return options[:file]
|
||||
end
|
||||
|
||||
def self.recurse(entity, options)
|
||||
spaces, string = '', ''
|
||||
options[:indent].times { spaces << ' ' }
|
||||
attributes = " id='#{entity.id}'"
|
||||
if !entity.features.nil? && entity.features.size != 0
|
||||
|
@ -56,27 +57,16 @@ class Treat::Workers::Formatters::Serializers::XML
|
|||
if entity.has_children?
|
||||
options[:indent] += 1
|
||||
entity.children.each do |child|
|
||||
string =
|
||||
string +
|
||||
serialize(child, options)
|
||||
string += self.recurse(child, options)
|
||||
end
|
||||
options[:indent] -= 1
|
||||
else
|
||||
string = string + "#{escape(entity.value)}"
|
||||
string += "#{escape(entity.value)}"
|
||||
end
|
||||
unless entity.is_a?(Treat::Entities::Token)
|
||||
string += "#{spaces}"
|
||||
end
|
||||
string += "</#{tag}>\n"
|
||||
if indent == 0
|
||||
string += "\n</treat>"
|
||||
if options[:file]
|
||||
File.open(options[:file], 'w') do |f|
|
||||
f.write(string)
|
||||
end
|
||||
end
|
||||
end
|
||||
options[:file]
|
||||
end
|
||||
|
||||
def self.escape(input)
|
||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Serializers::YAML
|
|||
|
||||
silence_warnings do
|
||||
# Require the Psych YAML serializer.
|
||||
require 'psych'
|
||||
require 'yaml'
|
||||
end
|
||||
|
||||
# Serialize an entity in YAML format.
|
||||
|
@ -11,7 +11,7 @@ class Treat::Workers::Formatters::Serializers::YAML
|
|||
# Options:
|
||||
# - (String) :file => a file to write to.
|
||||
def self.serialize(entity, options = {})
|
||||
yaml = ::Psych.dump(entity)
|
||||
yaml = ::YAML.dump(entity)
|
||||
options[:file] ||= (entity.id.to_s + '.yml')
|
||||
if options[:file]
|
||||
File.open(options[:file], 'w') do |f|
|
||||
|
|
|
@ -17,7 +17,7 @@ class Treat::Workers::Formatters::Unserializers::Mongo
|
|||
|
||||
@@database ||= Mongo::Connection.
|
||||
new(Treat.databases.mongo.host).
|
||||
db(Treat.databases.mongo.db || db)
|
||||
db(db || Treat.databases.mongo.db)
|
||||
|
||||
supertype = Treat::Entities.const_get(
|
||||
entity.type.to_s.capitalize.intern).superclass.mn.downcase
|
||||
|
|
|
@ -65,6 +65,7 @@ class Treat::Workers::Formatters::Unserializers::XML
|
|||
value = v
|
||||
else
|
||||
v = v[1..-1].intern if v[0] == ':'
|
||||
v = ":".intern if v == :''
|
||||
v = v.to_i if v =~ /^[0-9]*$/
|
||||
v = v.to_f if v =~ /^[0-9\.]*$/
|
||||
v = false if v == 'false'
|
||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Unserializers::YAML
|
|||
|
||||
silence_warnings do
|
||||
# Require the Psych YAML parser.
|
||||
require 'psych'
|
||||
require 'yaml'
|
||||
end
|
||||
|
||||
# Require date to revive DateTime.
|
||||
|
@ -13,7 +13,7 @@ class Treat::Workers::Formatters::Unserializers::YAML
|
|||
#
|
||||
# Options: none.
|
||||
def self.unserialize(document, options = {})
|
||||
document << ::Psych.load(
|
||||
document << ::YAML.load(
|
||||
File.read(document.file))
|
||||
document
|
||||
end
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
module Treat::Workers::Group
|
||||
module Treat::Workers::Groupable
|
||||
|
||||
# Lazily load the worker classes in the group.
|
||||
def const_missing(const)
|
||||
bits = self.ancestors[0].to_s.split('::')
|
||||
bits.collect! { |bit| bit.ucc }
|
||||
file = bits.join('/') + "/#{const.ucc}"
|
||||
if not File.readable?(Treat.paths.lib + "#{file}.rb")
|
||||
path = Treat.paths.lib + "#{file}.rb"
|
||||
if not File.readable?(path)
|
||||
raise Treat::Exception,
|
||||
"File '#{file}.rb' corresponding to " +
|
||||
"requested worker #{self}::#{const} " +
|
||||
|
@ -14,7 +15,7 @@ module Treat::Workers::Group
|
|||
require file
|
||||
if not self.const_defined?(const)
|
||||
raise Treat::Exception,
|
||||
"File #{file} does not define " +
|
||||
"File #{file}.rb does not define " +
|
||||
"#{self}::#{const}."
|
||||
end
|
||||
const_get(const)
|
||||
|
@ -69,9 +70,7 @@ module Treat::Workers::Group
|
|||
|
||||
# Get constants in this module, excluding by
|
||||
# default those defined by parent modules.
|
||||
def const_get(const)
|
||||
super(const, false)
|
||||
end
|
||||
def const_get(const); super(const, false); end
|
||||
|
||||
# Modify the extended class.
|
||||
def self.extended(group)
|
|
@ -35,9 +35,9 @@ class Treat::Workers::Inflectors::Cardinalizers::Linguistics
|
|||
# More specific options when using :type => :ordinal:
|
||||
def self.cardinal(entity, options = {})
|
||||
options = DefaultOptions.merge(options)
|
||||
Treat::Loaders::Linguistics.
|
||||
load(options[:language]).
|
||||
numwords(entity.to_s, options)
|
||||
lang = entity.language
|
||||
code = Treat::Loaders::Linguistics.load(lang)
|
||||
entity.to_s.send(code).numwords(options)
|
||||
end
|
||||
|
||||
end
|
|
@ -35,13 +35,15 @@ module Treat::Workers::Inflectors::Conjugators::Linguistics
|
|||
|
||||
options = Forms[options[:form].to_s] if options[:form]
|
||||
|
||||
klass = Treat::Loaders::Linguistics.load(entity.language)
|
||||
code = Treat::Loaders::Linguistics.load(entity.language)
|
||||
obj = entity.to_s.send(code)
|
||||
|
||||
if options[:mode] == 'infinitive'
|
||||
silence_warnings { klass.infinitive(entity.to_s) }
|
||||
obj.infinitive
|
||||
elsif options[:mode] == 'participle' && options[:tense] == 'present'
|
||||
silence_warnings { klass.present_participle(entity.to_s) }
|
||||
obj.present_participle
|
||||
elsif options[:count] == 'plural' && options.size == 1
|
||||
silence_warnings { klass.plural_verb(entity.to_s) }
|
||||
obj.plural_verb
|
||||
else
|
||||
raise Treat::Exception,
|
||||
'This combination of modes, tenses, persons ' +
|
||||
|
|
|
@ -21,9 +21,9 @@ class Treat::Workers::Inflectors::Declensors::English
|
|||
'option count ("singular" or "plural").'
|
||||
end
|
||||
string = entity.to_s
|
||||
if options[:count] == 'plural'
|
||||
if options[:count].to_s == 'plural'
|
||||
Inflect.plural(string)
|
||||
elsif options[:count] == 'singular'
|
||||
elsif options[:count].to_s == 'singular'
|
||||
Inflect.singular(string)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -17,34 +17,27 @@ class Treat::Workers::Inflectors::Declensors::Linguistics
|
|||
|
||||
cat = entity.get(:category)
|
||||
return if cat && !POS.include?(cat)
|
||||
|
||||
unless options[:count]
|
||||
raise Treat::Exception, 'Must supply ' +
|
||||
':count option ("singular" or "plural").'
|
||||
end
|
||||
|
||||
klass = Treat::Loaders::
|
||||
Linguistics.load(entity.language)
|
||||
string = entity.to_s
|
||||
|
||||
if options[:count] == 'plural'
|
||||
if (entity.has?(:category))
|
||||
result = ''
|
||||
silence_warnings do
|
||||
result = klass.send(
|
||||
:"plural_#{entity.category}",
|
||||
string)
|
||||
end
|
||||
return result
|
||||
else
|
||||
return klass.plural(string)
|
||||
end
|
||||
|
||||
else
|
||||
unless options[:count].to_s == 'plural'
|
||||
raise Treat::Exception,
|
||||
"Ruby Linguistics does not support " +
|
||||
"singularization of words."
|
||||
end
|
||||
|
||||
lang = entity.language
|
||||
code = Treat::Loaders::Linguistics.load(lang)
|
||||
obj = entity.to_s.send(code)
|
||||
|
||||
if cat = entity.get(:category)
|
||||
method = "plural_#{cat}"
|
||||
obj.send(method)
|
||||
else; obj.plural; end
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -11,11 +11,11 @@ class Treat::Workers::Inflectors::Ordinalizers::Linguistics
|
|||
|
||||
# Describe a number in words in ordinal form, using the
|
||||
# 'linguistics' gem.
|
||||
def self.ordinal(number, options = {})
|
||||
def self.ordinal(entity, options = {})
|
||||
options = DefaultOptions.merge(options)
|
||||
klass = Treat::Loaders::
|
||||
Linguistics.load(options[:language])
|
||||
klass.ordinate(number.to_s)
|
||||
lang = entity.language
|
||||
code = Treat::Loaders::Linguistics.load(lang)
|
||||
entity.to_s.send(code).ordinate
|
||||
end
|
||||
|
||||
end
|
|
@ -12,24 +12,20 @@ class Treat::Workers::Learners::Classifiers::ID3
|
|||
@@classifiers = {}
|
||||
|
||||
def self.classify(entity, options = {})
|
||||
|
||||
set = options[:training]
|
||||
cl = set.problem
|
||||
|
||||
if !@@classifiers[cl]
|
||||
dset = options[:training]
|
||||
prob = dset.problem
|
||||
if !@@classifiers[prob]
|
||||
dec_tree = DecisionTree::ID3Tree.new(
|
||||
cl.feature_labels.map { |l| l.to_s },
|
||||
set.items.map { |i| i[:features]},
|
||||
cl.question.default, cl.question.type)
|
||||
prob.feature_labels.map { |l| l.to_s },
|
||||
dset.items.map { |i| i[:features] },
|
||||
prob.question.default, prob.question.type)
|
||||
dec_tree.train
|
||||
@@classifiers[cl] = dec_tree
|
||||
@@classifiers[prob] = dec_tree
|
||||
else
|
||||
dec_tree = @@classifiers[cl]
|
||||
dec_tree.graph('testingbitch')
|
||||
dec_tree = @@classifiers[prob]
|
||||
end
|
||||
dec_tree.predict(
|
||||
cl.export_features(entity, false)
|
||||
)
|
||||
vect = prob.export_features(entity, false)
|
||||
dec_tree.predict(vect)
|
||||
end
|
||||
|
||||
end
|
|
@ -11,35 +11,23 @@ class Treat::Workers::Learners::Classifiers::Linear
|
|||
}
|
||||
|
||||
def self.classify(entity, options = {})
|
||||
|
||||
options = DefaultOptions.merge(options)
|
||||
set = options[:training]
|
||||
problem = set.problem
|
||||
|
||||
if !@@classifiers[problem]
|
||||
labels = problem.question.labels
|
||||
unless labels
|
||||
raise Treat::Exception,
|
||||
"LibLinear requires that you provide the possible " +
|
||||
"labels to assign to classification items when " +
|
||||
"specifying the question."
|
||||
end
|
||||
param = LParameter.new
|
||||
param.solver_type = options[:solver_type]
|
||||
param.eps = options[:eps]
|
||||
bias = options[:bias]
|
||||
data = set.items.map do |item|
|
||||
self.array_to_hash(item[:features])
|
||||
end
|
||||
prob = LProblem.new(labels, data, bias)
|
||||
@@classifiers[problem] =
|
||||
LModel.new(prob, param)
|
||||
dset = options[:training]
|
||||
prob, items = dset.problem, dset.items
|
||||
if !@@classifiers[prob]
|
||||
lparam = LParameter.new
|
||||
lparam.solver_type = options[:solver_type]
|
||||
lparam.eps = options[:eps]
|
||||
lbls = items.map { |it| it[:features][-1] }
|
||||
exs = items.map { |it| it[:features][0..-2] }.
|
||||
map { |ary| self.array_to_hash(ary) }
|
||||
lprob = LProblem.new(lbls, exs, options[:bias])
|
||||
model = LModel.new(lprob, lparam)
|
||||
@@classifiers[prob] = model
|
||||
end
|
||||
|
||||
@@classifiers[problem].predict(
|
||||
self.array_to_hash(problem.
|
||||
export_features(entity, false)))
|
||||
|
||||
features = prob.export_features(entity, false)
|
||||
@@classifiers[prob].predict(
|
||||
self.array_to_hash(features))
|
||||
end
|
||||
|
||||
def self.array_to_hash(array)
|
||||
|
|
|
@ -1,30 +1,43 @@
|
|||
# Classification based on a multilayer perceptron.
|
||||
class Treat::Workers::Learners::Classifiers::MLP
|
||||
|
||||
require 'ai4r'
|
||||
require 'ruby_fann/neural_network'
|
||||
|
||||
@@mlps = {}
|
||||
DefaultOptions = {
|
||||
num_inputs: 3,
|
||||
hidden_neurons: [2, 8, 4, 3, 4],
|
||||
num_outputs: 1,
|
||||
max_neurons: 1000,
|
||||
neurons_between_reports: 1,
|
||||
desired_error: 0.1
|
||||
}
|
||||
|
||||
@@classifiers = {}
|
||||
|
||||
def self.classify(entity, options = {})
|
||||
|
||||
set = options[:training]
|
||||
cl = set.problem
|
||||
|
||||
if !@@mlps[cl]
|
||||
net = Ai4r::NeuralNetwork::Backpropagation.new(
|
||||
[cl.feature_labels.size, 3, 1])
|
||||
set.items.each do |item|
|
||||
inputs = item[:features][0..-2]
|
||||
outputs = [item[:features][-1]]
|
||||
net.train(inputs, outputs)
|
||||
options = DefaultOptions.merge(options)
|
||||
dset = options[:training]
|
||||
prob, items = dset.problem, dset.items
|
||||
if !@@classifiers[prob]
|
||||
fann = RubyFann::Standard.new(options)
|
||||
inputs = items.map { |it| it[:features][0..-2] }
|
||||
outputs = items.map { |it| [it[:features][-1]] }
|
||||
training = silence_stdout do
|
||||
RubyFann::TrainData.new(inputs:
|
||||
inputs, desired_outputs: outputs)
|
||||
end
|
||||
@@mlps[cl] = net
|
||||
params = [options[:max_neurons],
|
||||
options[:neurons_between_reports],
|
||||
options[:desired_error]]
|
||||
fann.train_on_data(training, *params)
|
||||
@@classifiers[prob] = fann
|
||||
else
|
||||
net = @@mlps[cl]
|
||||
fann = @@classifiers[prob]
|
||||
end
|
||||
|
||||
net.eval(cl.export_features(entity, false))[0]
|
||||
|
||||
vect = prob.export_features(entity, false)
|
||||
Treat.core.verbosity.silence ?
|
||||
silence_stdout { fann.run(vect)[0] } :
|
||||
fann.run(vect)[0]
|
||||
end
|
||||
|
||||
end
|
|
@ -5,7 +5,7 @@ class Treat::Workers::Learners::Classifiers::SVM
|
|||
@@classifiers = {}
|
||||
|
||||
DefaultOptions = {
|
||||
cache_size: 1,
|
||||
cache_size: 1, # in MB
|
||||
eps: 0.001,
|
||||
c: 10
|
||||
}
|
||||
|
@ -14,35 +14,25 @@ class Treat::Workers::Learners::Classifiers::SVM
|
|||
# - (Numeric) :eps => tolerance of termination criterion
|
||||
# - (Numeric) :c => C parameter
|
||||
def self.classify(entity, options = {})
|
||||
|
||||
options = DefaultOptions.merge(options)
|
||||
set = options[:training]
|
||||
problem = set.problem
|
||||
|
||||
if !@@classifiers[problem]
|
||||
labels = problem.question.labels
|
||||
unless labels
|
||||
raise Treat::Exception,
|
||||
"LibSVM requires that you provide the possible " +
|
||||
"labels to assign to classification items when " +
|
||||
"specifying the question."
|
||||
end
|
||||
examples = set.items.map { |item| item[:features] }
|
||||
prob = Libsvm::Problem.new
|
||||
prob.set_examples(labels, examples)
|
||||
param = Libsvm::SvmParameter.new
|
||||
param.cache_size = options[:cache_size]
|
||||
param.eps = options[:eps]
|
||||
param.c = options[:c]
|
||||
model = Libsvm::Model.train(problem, parameter)
|
||||
@@classifiers[problem] = model
|
||||
dset = options[:training]
|
||||
prob, items = dset.problem, dset.items
|
||||
if !@@classifiers[prob]
|
||||
lprob = Libsvm::Problem.new
|
||||
lparam = Libsvm::SvmParameter.new
|
||||
lparam.cache_size = options[:cache_size]
|
||||
lparam.eps = options[:eps]
|
||||
lparam.c = options[:c]
|
||||
llabels = items.map { |it| it[:features][-1] }
|
||||
lexamples = items.map { |it| it[:features][0..-2] }.
|
||||
map { |ary| Libsvm::Node.features(ary) }
|
||||
lprob.set_examples(llabels, lexamples)
|
||||
model = Libsvm::Model.train(lprob, lparam)
|
||||
@@classifiers[prob] = model
|
||||
end
|
||||
|
||||
features = problem.export_features(entity, false)
|
||||
|
||||
@@classifiers[problem].predict(
|
||||
Libsvm::Node.features(*features))
|
||||
|
||||
features = prob.export_features(entity, false)
|
||||
@@classifiers[prob].predict(
|
||||
Libsvm::Node.features(features))
|
||||
end
|
||||
|
||||
end
|
|
@ -28,8 +28,9 @@ class Treat::Workers::Lexicalizers::Categorizers::FromTag
|
|||
|
||||
tag = entity.check_has(:tag)
|
||||
|
||||
return 'unknown' if tag.nil? || tag == '' || entity.type == :symbol
|
||||
return 'sentence' if tag == 'S' || entity.type == :sentence
|
||||
return 'unknown' if tag.nil? || tag == ''
|
||||
return 'fragment' if tag == 'F'
|
||||
return 'sentence' if tag == 'S'
|
||||
return 'number' if entity.type == :number
|
||||
|
||||
return Ptc[entity.to_s] if entity.type == :punctuation
|
||||
|
|
|
@ -1,62 +1,79 @@
|
|||
# Sense information (synonyms, antonyms, hypernyms
|
||||
# and hyponyms) obtained through a Ruby parser that
|
||||
# accesses Wordnet flat files.
|
||||
#
|
||||
# Original paper: George A. Miller (1995). WordNet:
|
||||
# A Lexical Database for English. Communications of
|
||||
#
|
||||
# Original paper: George A. Miller (1995). WordNet:
|
||||
# A Lexical Database for English. Communications of
|
||||
# the ACM Vol. 38, No. 11: 39-41.
|
||||
class Treat::Workers::Lexicalizers::Sensers::Wordnet
|
||||
|
||||
# Require the 'wordnet' gem (install as 'rwordnet').
|
||||
require 'wordnet'
|
||||
|
||||
|
||||
# Patch for bug.
|
||||
::WordNet.module_eval do
|
||||
remove_const(:SynsetType)
|
||||
const_set(:SynsetType,
|
||||
remove_const(:SYNSET_TYPES)
|
||||
const_set(:SYNSET_TYPES,
|
||||
{"n" => "noun", "v" => "verb", "a" => "adj"})
|
||||
end
|
||||
|
||||
|
||||
# Require an adaptor for Wordnet synsets.
|
||||
require_relative 'wordnet/synset'
|
||||
|
||||
# Noun, adjective and verb indexes.
|
||||
@@indexes = {}
|
||||
|
||||
|
||||
# Obtain lexical information about a word using the
|
||||
# ruby 'wordnet' gem.
|
||||
def self.sense(word, options = nil)
|
||||
|
||||
|
||||
category = word.check_has(:category)
|
||||
|
||||
unless options[:nym]
|
||||
|
||||
if !options[:nym]
|
||||
raise Treat::Exception, "You must supply " +
|
||||
"the :nym option (:synonym, :hypernym, etc.)"
|
||||
"the :nym option ('synonyms', 'hypernyms', etc.)"
|
||||
end
|
||||
|
||||
|
||||
if !options[:nym].is_a?(Symbol)
|
||||
options[:nym] = options[:nym].intern
|
||||
end
|
||||
|
||||
if ![:synonyms, :antonyms,
|
||||
:hypernyms, :hyponyms].include?(options[:nym])
|
||||
raise Treat::Exception, "You must supply " +
|
||||
"a valid :nym option ('synonyms', 'hypernyms', etc.)"
|
||||
end
|
||||
|
||||
unless ['noun', 'adjective', 'verb'].
|
||||
include?(word.category)
|
||||
return []
|
||||
end
|
||||
|
||||
cat = category.to_s.capitalize
|
||||
|
||||
@@indexes[cat] ||=
|
||||
::WordNet.const_get(cat + 'Index').instance
|
||||
lemma = @@indexes[cat].find(word.value.downcase)
|
||||
|
||||
cat = abbreviate(category)
|
||||
|
||||
lemma = ::WordNet::Lemma.find(word.value.downcase, cat)
|
||||
|
||||
return [] if lemma.nil?
|
||||
synsets = []
|
||||
|
||||
|
||||
lemma.synsets.each do |synset|
|
||||
synsets <<
|
||||
synsets <<
|
||||
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
||||
end
|
||||
|
||||
|
||||
((synsets.collect do |ss|
|
||||
ss.send(options[:nym])
|
||||
end - [word.value]).flatten).uniq
|
||||
|
||||
end - [word.value]).
|
||||
flatten).uniq.map do |token|
|
||||
token.gsub('_', ' ')
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
def self.abbreviate category
|
||||
if category == 'adjective'
|
||||
:adj
|
||||
elsif category == 'adverb'
|
||||
:adv
|
||||
else
|
||||
category.to_sym
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -40,15 +40,15 @@ class Treat::Workers::Lexicalizers::Taggers::Brill
|
|||
return pair[1] if isolated_token
|
||||
end
|
||||
|
||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
||||
!entity.parent_sentence)
|
||||
if entity.is_a?(Treat::Entities::Group) &&
|
||||
!entity.parent_sentence
|
||||
entity.set :tag_set, :penn
|
||||
end
|
||||
|
||||
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||
|
||||
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||
end
|
||||
|
||||
end
|
|
@ -61,15 +61,16 @@ class Treat::Workers::Lexicalizers::Taggers::Lingua
|
|||
end
|
||||
|
||||
|
||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
||||
!entity.parent_sentence)
|
||||
if entity.is_a?(Treat::Entities::Group) &&
|
||||
!entity.parent_sentence
|
||||
entity.set :tag_set, :penn
|
||||
end
|
||||
|
||||
|
||||
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||
|
||||
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||
|
||||
end
|
||||
|
||||
end
|
|
@ -1,15 +1,15 @@
|
|||
# POS tagging using (i) explicit use of both preceding
|
||||
# and following tag contexts via a dependency network
|
||||
# representation, (ii) broad use of lexical features,
|
||||
# including jointly conditioning on multiple consecutive
|
||||
# words, (iii) effective use of priors in conditional
|
||||
# loglinear models, and (iv) fine-grained modeling of
|
||||
# unknown word features.
|
||||
# POS tagging using a maximum entropy model, with (i)
|
||||
# explicit use of both preceding and following tag
|
||||
# contexts via a dependency network representation,
|
||||
# (ii) broad use of lexical features, including jointly
|
||||
# conditioning on multiple consecutive words, (iii)
|
||||
# effective use of priors in conditional loglinear models,
|
||||
# and (iv) fine-grained modeling of unknown word features.
|
||||
#
|
||||
# Original paper: Toutanova, Manning, Klein and Singer.
|
||||
# 2003. Feature-Rich Part-of-Speech Tagging with a
|
||||
# Cyclic Dependency Network. In Proceedings of the
|
||||
# Conference of the North American Chapter of the
|
||||
# 2003. Feature-Rich Part-of-Speech Tagging with a
|
||||
# Cyclic Dependency Network. In Proceedings of the
|
||||
# Conference of the North American Chapter of the
|
||||
# Association for Computational Linguistics.
|
||||
class Treat::Workers::Lexicalizers::Taggers::Stanford
|
||||
|
||||
|
@ -25,34 +25,32 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
|
|||
def self.tag(entity, options = {})
|
||||
|
||||
# Handle tags for sentences and phrases.
|
||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
||||
!entity.parent_sentence)
|
||||
if entity.is_a?(Treat::Entities::Group) &&
|
||||
!entity.parent_sentence
|
||||
|
||||
tag_set = options[:tag_set]
|
||||
entity.set :tag_set, tag_set
|
||||
end
|
||||
|
||||
if entity.is_a?(Treat::Entities::Sentence)
|
||||
return 'S'
|
||||
elsif entity.is_a?(Treat::Entities::Phrase)
|
||||
return 'P'
|
||||
end
|
||||
|
||||
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||
|
||||
# Handle options and initialize the tagger.
|
||||
lang = entity.language
|
||||
options = get_options(options, lang)
|
||||
lang = entity.language.intern
|
||||
init_tagger(lang) unless @@taggers[lang]
|
||||
tokens, list = get_token_list(entity)
|
||||
options = get_options(options, lang)
|
||||
tokens, t_list = get_token_list(entity)
|
||||
|
||||
# Do the tagging.
|
||||
i = 0
|
||||
isolated_token = entity.is_a?(Treat::Entities::Token)
|
||||
|
||||
@@taggers[lang].apply(list).each do |tok|
|
||||
tokens[i].set :tag, tok.tag
|
||||
tokens[i].set :tag_set,
|
||||
options[:tag_set] if isolated_token
|
||||
@@taggers[lang].apply(t_list).each do |tok|
|
||||
tokens[i].set(:tag, tok.tag.split('-').first)
|
||||
tokens[i].set(:tag_set,
|
||||
options[:tag_set]) if isolated_token
|
||||
return tok.tag if isolated_token
|
||||
i += 1
|
||||
end
|
||||
|
@ -61,21 +59,24 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
|
|||
|
||||
# Initialize the tagger for a language.
|
||||
def self.init_tagger(language)
|
||||
Treat::Loaders::Stanford.load(language)
|
||||
model = StanfordCoreNLP::Config::Models[:pos][language]
|
||||
model_path = Treat.libraries.stanford.model_path ||
|
||||
Treat.paths.models + 'stanford/'
|
||||
model = model_path + StanfordCoreNLP::
|
||||
Config::ModelFolders[:pos] + model
|
||||
@@taggers[language] ||=
|
||||
StanfordCoreNLP::MaxentTagger.new(model)
|
||||
unless @@taggers[language]
|
||||
Treat::Loaders::Stanford.load(language)
|
||||
unless StanfordCoreNLP.const_defined?('MaxentTagger')
|
||||
StanfordCoreNLP.load_class('MaxentTagger',
|
||||
'edu.stanford.nlp.tagger.maxent')
|
||||
end
|
||||
model = Treat::Loaders::Stanford.find_model(:pos,language)
|
||||
tagger = StanfordCoreNLP::MaxentTagger.new(model)
|
||||
@@taggers[language] = tagger
|
||||
end
|
||||
@@taggers[language]
|
||||
end
|
||||
|
||||
# Handle the options for the tagger.
|
||||
def self.get_options(options, language)
|
||||
options = DefaultOptions.merge(options)
|
||||
if options[:tagger_model]
|
||||
::StanfordCoreNLP.set_model('pos.model',
|
||||
StanfordCoreNLP.set_model('pos.model',
|
||||
options[:tagger_model])
|
||||
end
|
||||
options[:tag_set] =
|
||||
|
|
|
@ -2,16 +2,13 @@ class Treat::Workers::Processors::Chunkers::Autoselect
|
|||
|
||||
def self.chunk(entity, options = {})
|
||||
unless entity.has?(:format)
|
||||
raise Treat::Exception,
|
||||
"Must have a format to autoselect chunker."
|
||||
entity.set :format, 'txt'
|
||||
end
|
||||
begin
|
||||
k = Treat::Workers::Processors::
|
||||
Chunkers.const_get(entity.format.cc)
|
||||
k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
|
||||
k.chunk(entity, options)
|
||||
rescue Treat::Exception
|
||||
Treat::Workers::Processors::
|
||||
Chunkers::TXT.chunk(entity, options)
|
||||
Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -3,12 +3,9 @@ class Treat::Workers::Processors::Chunkers::HTML
|
|||
require 'nokogiri'
|
||||
|
||||
def self.chunk(entity, options = {})
|
||||
|
||||
entity.check_hasnt_children
|
||||
|
||||
doc = Nokogiri::HTML(entity.value)
|
||||
recurse(entity, doc)
|
||||
|
||||
self.recurse(entity, doc)
|
||||
end
|
||||
|
||||
def self.recurse(node, html_node, level = 1)
|
||||
|
@ -16,7 +13,6 @@ class Treat::Workers::Processors::Chunkers::HTML
|
|||
html_node.children.each do |child|
|
||||
|
||||
next if child.name == 'text'
|
||||
|
||||
txt = child.inner_text
|
||||
|
||||
if child.name =~ /^h([0-9]{1})$/ ||
|
||||
|
|
|
@ -12,16 +12,13 @@ class Treat::Workers::Processors::Chunkers::TXT
|
|||
zones.each do |zone|
|
||||
zone.strip!
|
||||
next if zone == ''
|
||||
c = Treat::Entities::
|
||||
Zone.from_string(zone)
|
||||
c = Treat::Entities::Zone.from_string(zone)
|
||||
if c.type == :title
|
||||
if current.type == :section
|
||||
current = current.parent
|
||||
current = entity << Treat::
|
||||
Entities::Section.new
|
||||
current = entity << Treat::Entities::Section.new
|
||||
else
|
||||
current = entity << Treat::
|
||||
Entities::Section.new
|
||||
current = entity << Treat::Entities::Section.new
|
||||
end
|
||||
end
|
||||
current << c
|
||||
|
|
|
@ -1,150 +1,88 @@
|
|||
# Parsing using an interface to a Java implementation
|
||||
# of probabilistic natural language parsers, both
|
||||
# optimized PCFG and lexicalized dependency parsers,
|
||||
# and a lexicalized PCFG parser.
|
||||
#
|
||||
# Original paper: Dan Klein and Christopher D.
|
||||
# Manning. 2003. Accurate Unlexicalized Parsing.
|
||||
# Proceedings of the 41st Meeting of the Association
|
||||
# Parsing using an interface to a Java implementation
|
||||
# of probabilistic natural language parsers, both
|
||||
# optimized PCFG and lexicalized dependency parsers,
|
||||
# and a lexicalized PCFG parser.
|
||||
#
|
||||
# Original paper: Dan Klein and Christopher D.
|
||||
# Manning. 2003. Accurate Unlexicalized Parsing.
|
||||
# Proceedings of the 41st Meeting of the Association
|
||||
# for Computational Linguistics, pp. 423-430.
|
||||
class Treat::Workers::Processors::Parsers::Stanford
|
||||
|
||||
|
||||
Pttc = Treat.tags.aligned.phrase_tags_to_category
|
||||
|
||||
|
||||
# Hold one instance of the pipeline per language.
|
||||
@@parsers = {}
|
||||
|
||||
DefaultOptions = {
|
||||
:parser_model => nil,
|
||||
:tagger_model => nil
|
||||
}
|
||||
DefaultOptions = { model: nil }
|
||||
|
||||
# Parse the entity using the Stanford parser.
|
||||
#
|
||||
# Options:
|
||||
#
|
||||
# - (Boolean) :silent => whether to silence the output
|
||||
# of the JVM.
|
||||
# - (String) :log_file => a filename to log output to
|
||||
# instead of displaying it.
|
||||
def self.parse(entity, options = {})
|
||||
|
||||
entity.check_hasnt_children
|
||||
val, lang = entity.to_s, entity.language.intern
|
||||
|
||||
val = entity.to_s
|
||||
lang = entity.language
|
||||
init(lang, options)
|
||||
Treat::Loaders::Stanford.load(lang)
|
||||
|
||||
tag_set = StanfordCoreNLP::Config::TagSets[lang]
|
||||
|
||||
text = ::StanfordCoreNLP::Text.new(val)
|
||||
@@parsers[lang].annotate(text)
|
||||
|
||||
text.get(:sentences).each do |s|
|
||||
|
||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
||||
entity.is_a?(Treat::Entities::Phrase)
|
||||
tag = s.get(:category).to_s
|
||||
tag_s, tag_opt = *tag.split('-')
|
||||
tag_s ||= 'S'
|
||||
entity.set :tag, tag_s
|
||||
entity.set :tag_opt, tag_opt if tag_opt
|
||||
recurse(s.get(:tree).children[0], entity, tag_set)
|
||||
break #######
|
||||
else
|
||||
recurse(s.get(:tree), entity, tag_set)
|
||||
end
|
||||
|
||||
list = get_token_list(entity)
|
||||
entity.remove_all!
|
||||
|
||||
model_file = options[:model] ||
|
||||
StanfordCoreNLP::Config::Models[:parse][lang]
|
||||
|
||||
unless @@parsers[lang] && @@parsers[lang][model_file]
|
||||
model_path = Treat.libraries.stanford.model_path ||
|
||||
StanfordCoreNLP.model_path
|
||||
model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
|
||||
model = File.join(model_path, model_folder, model_file)
|
||||
@@parsers[lang] ||= {}
|
||||
options = StanfordCoreNLP::Options.new
|
||||
parser = StanfordCoreNLP::LexicalizedParser
|
||||
.getParserFromFile(model, options)
|
||||
@@parsers[lang][model_file] = parser
|
||||
end
|
||||
|
||||
|
||||
parser = @@parsers[lang][model_file]
|
||||
|
||||
text = parser.apply(list)
|
||||
|
||||
recurse(text.children[0], entity, tag_set)
|
||||
entity.set :tag_set, tag_set
|
||||
|
||||
|
||||
end
|
||||
|
||||
def self.init(lang, options)
|
||||
return if @@parsers[lang]
|
||||
def self.recurse(java_node, ruby_node, tag_set)
|
||||
|
||||
Treat::Loaders::Stanford.load(lang)
|
||||
|
||||
options = DefaultOptions.merge(options)
|
||||
StanfordCoreNLP.use(lang)
|
||||
if options[:tagger_model]
|
||||
::StanfordCoreNLP.set_model(
|
||||
'pos.model', options[:tagger_model]
|
||||
)
|
||||
end
|
||||
if options[:parser_model]
|
||||
::StanfordCoreNLP.set_model(
|
||||
'parser.model', options[:parser_model]
|
||||
)
|
||||
end
|
||||
@@parsers[lang] ||=
|
||||
::StanfordCoreNLP.load(
|
||||
:tokenize, :ssplit, :pos, :lemma, :parse
|
||||
)
|
||||
end
|
||||
java_node.children.each do |java_child|
|
||||
|
||||
# Helper method which recurses the tree supplied by
|
||||
# the Stanford parser.
|
||||
def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
|
||||
label = java_child.label
|
||||
tag = label.get(:category).to_s
|
||||
|
||||
if java_node.num_children == 0
|
||||
|
||||
label = java_node.label
|
||||
tag = label.get(:part_of_speech).to_s
|
||||
tag_s, tag_opt = *tag.split('-')
|
||||
tag_s ||= ''
|
||||
ruby_node.value = java_node.value.to_s.strip
|
||||
ruby_node.set :tag, tag_s
|
||||
ruby_node.set :tag_opt, tag_opt if tag_opt
|
||||
ruby_node.set :lemma, label.get(:lemma).to_s
|
||||
|
||||
additional_tags.each do |t|
|
||||
lt = label.get(t)
|
||||
ruby_node.set t, lt.to_s if lt
|
||||
end
|
||||
|
||||
ruby_node
|
||||
|
||||
else
|
||||
|
||||
if java_node.num_children == 1 &&
|
||||
java_node.children[0].num_children == 0
|
||||
recurse(java_node.children[0],
|
||||
ruby_node, tag_set, additional_tags)
|
||||
return
|
||||
end
|
||||
|
||||
java_node.children.each do |java_child|
|
||||
|
||||
label = java_child.label
|
||||
tag = label.get(:category).to_s
|
||||
tag_s, tag_opt = *tag.split('-')
|
||||
tag_s ||= ''
|
||||
|
||||
if Pttc[tag_s] && Pttc[tag_s][tag_set]
|
||||
ruby_child = Treat::Entities::Phrase.new
|
||||
else
|
||||
l = java_child.children[0].to_s
|
||||
v = java_child.children[0].value.to_s.strip
|
||||
|
||||
# Mhmhmhmhmhm
|
||||
val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
|
||||
ruby_child = Treat::Entities::Token.from_string(val)
|
||||
end
|
||||
|
||||
ruby_child.set :tag, tag_s
|
||||
ruby_child.set :tag_opt, tag_opt if tag_opt
|
||||
if Pttc[tag] && Pttc[tag][tag_set]
|
||||
ruby_child = Treat::Entities::Phrase.new
|
||||
ruby_child.set :tag, tag
|
||||
ruby_node << ruby_child
|
||||
|
||||
unless java_child.children.empty?
|
||||
recurse(java_child, ruby_child, tag_set, additional_tags)
|
||||
recurse(java_child, ruby_child, tag_set)
|
||||
end
|
||||
|
||||
else
|
||||
val = java_child.children[0].to_s
|
||||
ruby_child = Treat::Entities::Token.from_string(val)
|
||||
ruby_child.set :tag, tag
|
||||
ruby_node << ruby_child
|
||||
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
def self.get_token_list(entity)
|
||||
list = StanfordCoreNLP::ArrayList.new
|
||||
entity.tokens.each do |token|
|
||||
list.add(StanfordCoreNLP::Word.new(token.to_s))
|
||||
end
|
||||
list
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -13,7 +13,7 @@ class Treat::Workers::Processors::Segmenters::Punkt
|
|||
silence_warnings { require 'punkt-segmenter' }
|
||||
|
||||
# Require the YAML parser.
|
||||
silence_warnings { require 'psych' }
|
||||
# silence_warnings { require 'psych' }
|
||||
|
||||
# Hold one copy of the segmenter per language.
|
||||
@@segmenters = {}
|
||||
|
@ -87,7 +87,7 @@ class Treat::Workers::Processors::Segmenters::Punkt
|
|||
end
|
||||
end
|
||||
|
||||
t = ::Psych.load(File.read(model))
|
||||
t = ::YAML.load(File.read(model))
|
||||
|
||||
@@segmenters[lang] =
|
||||
::Punkt::SentenceTokenizer.new(t)
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue