Compare commits
254 Commits
config-ref
...
master
Author | SHA1 | Date |
---|---|---|
![]() |
8c6450a0d7 | |
![]() |
d6a163f8f7 | |
![]() |
58911ce969 | |
![]() |
7b6e8fe6a8 | |
![]() |
f1f8c010a6 | |
![]() |
a86c3036cc | |
![]() |
e6c1f7c3a4 | |
![]() |
4dfaf83a14 | |
![]() |
eed75d3ba7 | |
![]() |
3dc184277d | |
![]() |
e218791c88 | |
![]() |
2d6891ab9c | |
![]() |
f61adc8187 | |
![]() |
cb7ae61cc5 | |
![]() |
394edb8fc4 | |
![]() |
721215b64a | |
![]() |
90fdcd8c9e | |
![]() |
3992f2c177 | |
![]() |
159f3b266e | |
![]() |
b7c6ae452b | |
![]() |
e753389911 | |
![]() |
1b783faf77 | |
![]() |
febc47a905 | |
![]() |
3969478f7f | |
![]() |
fc77587390 | |
![]() |
eb0ec83053 | |
![]() |
f8e5f8392a | |
![]() |
c813e72c2c | |
![]() |
316a6cec04 | |
![]() |
d45d82b3c4 | |
![]() |
78c13c280f | |
![]() |
690157af8b | |
![]() |
a8ce6b2f18 | |
![]() |
f7bd0c35d7 | |
![]() |
65235fb935 | |
![]() |
0ed8a37761 | |
![]() |
8f82086bb1 | |
![]() |
d9b912f24d | |
![]() |
10e6612a06 | |
![]() |
5fb11131c7 | |
![]() |
281934d572 | |
![]() |
c9bf91562b | |
![]() |
643937b231 | |
![]() |
fff57a4526 | |
![]() |
49e99ccd0d | |
![]() |
136a7cdbb3 | |
![]() |
a8448f8103 | |
![]() |
91e3571699 | |
![]() |
6464bb64a0 | |
![]() |
5c984a1dbb | |
![]() |
20e3aa1bb7 | |
![]() |
e483b764e4 | |
![]() |
3a367f7c56 | |
![]() |
c99dfac530 | |
![]() |
ee98b19960 | |
![]() |
bf6d951cf7 | |
![]() |
1b28d3640d | |
![]() |
727a307af0 | |
![]() |
cf8acd2b64 | |
![]() |
1aa4df9566 | |
![]() |
89cf494cba | |
![]() |
e2ef8ea748 | |
![]() |
038d62b12b | |
![]() |
767e1f7f45 | |
![]() |
4cbcf03d43 | |
![]() |
5e4189218c | |
![]() |
482ed30d76 | |
![]() |
b9ee19e276 | |
![]() |
fe7f97afd7 | |
![]() |
e135ffa466 | |
![]() |
a755272934 | |
![]() |
8b8a769d74 | |
![]() |
e2b813ca21 | |
![]() |
f72c8aec54 | |
![]() |
6a025ca4d6 | |
![]() |
1f69c3f089 | |
![]() |
4b74913bcd | |
![]() |
a34714cb99 | |
![]() |
b37ecdb695 | |
![]() |
01c3e50ef0 | |
![]() |
ba5a2d78ad | |
![]() |
c6ceac22d6 | |
![]() |
c18d0073f7 | |
![]() |
056ef129d6 | |
![]() |
6e644db50a | |
![]() |
d351108ad0 | |
![]() |
cbca3d8d37 | |
![]() |
ebf8a16c2a | |
![]() |
683861dfd1 | |
![]() |
2cded62abf | |
![]() |
e32f44f0a0 | |
![]() |
e87574c03d | |
![]() |
6cf37a2926 | |
![]() |
ff80d8ab2c | |
![]() |
fdb5abb748 | |
![]() |
931a66ec40 | |
![]() |
644248504d | |
![]() |
0731d33024 | |
![]() |
e8b12c5d03 | |
![]() |
cc2b8d6679 | |
![]() |
e284aee7eb | |
![]() |
8209447625 | |
![]() |
201ac69775 | |
![]() |
c42ee58bcd | |
![]() |
970a6248d5 | |
![]() |
7fb10de5cc | |
![]() |
577a063165 | |
![]() |
9fa4ee9c1e | |
![]() |
fd0733113a | |
![]() |
50ac0f67e0 | |
![]() |
c2532d6cd2 | |
![]() |
0abeb6ba14 | |
![]() |
d38387265b | |
![]() |
abf16b2b3e | |
![]() |
20b49cc471 | |
![]() |
3295529e97 | |
![]() |
5087b53623 | |
![]() |
e149c46bde | |
![]() |
9836dabe87 | |
![]() |
02ccf9942f | |
![]() |
c2a7a33fae | |
![]() |
e9dd1a9298 | |
![]() |
3484baa904 | |
![]() |
2b8e269380 | |
![]() |
28669ec6c6 | |
![]() |
5daa79c2f0 | |
![]() |
72eb214650 | |
![]() |
9f89a006a4 | |
![]() |
ec3465d24d | |
![]() |
b76a5c2c15 | |
![]() |
3e2a898155 | |
![]() |
7ef79deaa5 | |
![]() |
e13ff4dcab | |
![]() |
796ae59d77 | |
![]() |
9f54b6daba | |
![]() |
66f0cadc57 | |
![]() |
a7b4a7eea8 | |
![]() |
2d9ef825c2 | |
![]() |
0e06750ede | |
![]() |
2c6c4ec964 | |
![]() |
4d60ceb6ea | |
![]() |
b57d047ea8 | |
![]() |
10abcb7e20 | |
![]() |
1ab18f42c3 | |
![]() |
b6be6e7e1d | |
![]() |
4577430e84 | |
![]() |
6c46ebf707 | |
![]() |
01125b8a38 | |
![]() |
a7991bd009 | |
![]() |
6cc94ca6c5 | |
![]() |
6f443b3087 | |
![]() |
5b22b0d3b8 | |
![]() |
7c1b597619 | |
![]() |
2394de746d | |
![]() |
7f83e78a04 | |
![]() |
1c69d6558e | |
![]() |
da096bd15e | |
![]() |
1e47f41510 | |
![]() |
d3e52fb36a | |
![]() |
b5528a34f1 | |
![]() |
e4664586fe | |
![]() |
a49d1d97a0 | |
![]() |
fbd5e48fae | |
![]() |
ee3ed723bc | |
![]() |
476962e126 | |
![]() |
57e3c88d9e | |
![]() |
bb4910c43d | |
![]() |
a1d09cbb99 | |
![]() |
c8020c90e6 | |
![]() |
5ac537db5a | |
![]() |
2162339419 | |
![]() |
be96c54064 | |
![]() |
a116cc8a5b | |
![]() |
12cac5c769 | |
![]() |
1eb9341f77 | |
![]() |
22b2afdddf | |
![]() |
01d8f656b2 | |
![]() |
9af52ec798 | |
![]() |
30196bb699 | |
![]() |
f2edcf8118 | |
![]() |
b20836bc04 | |
![]() |
3fa0e8593b | |
![]() |
cd044f1c59 | |
![]() |
5144f43c63 | |
![]() |
f0d63fea20 | |
![]() |
7f93decac7 | |
![]() |
f6862357e6 | |
![]() |
19813f6a14 | |
![]() |
295e22417d | |
![]() |
9fb2c9a127 | |
![]() |
e5f8da4201 | |
![]() |
386d56de70 | |
![]() |
24e02802b9 | |
![]() |
18fce305ed | |
![]() |
49cfd6a6f7 | |
![]() |
5c1ca9d9f1 | |
![]() |
f7f53b0f7e | |
![]() |
a6b42ca20e | |
![]() |
d3f6df6266 | |
![]() |
b20a5dd412 | |
![]() |
d09dd391b7 | |
![]() |
a8170397ff | |
![]() |
c182ca8a42 | |
![]() |
8b8a1c93f3 | |
![]() |
ba7d0995d6 | |
![]() |
c7aa48e130 | |
![]() |
22f17a5b4c | |
![]() |
61d683603c | |
![]() |
3c925e579f | |
![]() |
f01f050f9d | |
![]() |
afefa10e87 | |
![]() |
d2a41c0dea | |
![]() |
651c8490f2 | |
![]() |
1355122e2f | |
![]() |
65311ceda9 | |
![]() |
0240847ce1 | |
![]() |
3ae63233c2 | |
![]() |
86db70f1e8 | |
![]() |
2c7aff0323 | |
![]() |
98b39ef541 | |
![]() |
f4975270b2 | |
![]() |
e73463c0d0 | |
![]() |
e995da0e71 | |
![]() |
aa99e71058 | |
![]() |
8baa98930a | |
![]() |
7997a5d6b7 | |
![]() |
c0e98efb76 | |
![]() |
be72e2499a | |
![]() |
2495d9cf05 | |
![]() |
e588051558 | |
![]() |
25689e5ef1 | |
![]() |
decb377e55 | |
![]() |
5535c03141 | |
![]() |
7a902900b2 | |
![]() |
69d7c9ff8a | |
![]() |
56902944c1 | |
![]() |
a713441546 | |
![]() |
d43e6be5c8 | |
![]() |
e8baae7513 | |
![]() |
f2c22b42f4 | |
![]() |
5a750834ae | |
![]() |
363eaca919 | |
![]() |
06913afa8a | |
![]() |
9e755535c6 | |
![]() |
4dd56750fb | |
![]() |
544a2a115e | |
![]() |
19a59eaae8 | |
![]() |
233894a5d8 | |
![]() |
a34fca4e88 | |
![]() |
a36bd07777 | |
![]() |
13175e8f25 | |
![]() |
7b30c26686 | |
![]() |
502b21797f | |
![]() |
e79c2dbdb8 |
|
@ -13,4 +13,5 @@
|
||||||
*.yaml
|
*.yaml
|
||||||
spec/sandbox.rb
|
spec/sandbox.rb
|
||||||
coverage/*
|
coverage/*
|
||||||
|
benchmark/*
|
||||||
TODO
|
TODO
|
11
.travis.yml
11
.travis.yml
|
@ -1,11 +1,18 @@
|
||||||
language: ruby
|
language: ruby
|
||||||
|
|
||||||
rvm:
|
rvm:
|
||||||
- 1.9.2
|
- 1.9.2
|
||||||
- 1.9.3
|
- 1.9.3
|
||||||
|
- 2.0
|
||||||
|
- 2.1
|
||||||
|
- 2.2
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk-i386/"
|
- export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk-i386/"
|
||||||
before_script:
|
|
||||||
|
before_script:
|
||||||
- sudo apt-get install antiword
|
- sudo apt-get install antiword
|
||||||
- sudo apt-get install poppler-utils
|
- sudo apt-get install poppler-utils
|
||||||
- rake treat:install[travis] --trace
|
- rake treat:install[travis] --trace
|
||||||
script: rake treat:spec --trace
|
|
||||||
|
script: rake treat:spec --trace
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
# A boolean value indicating whether to silence
|
||||||
|
# the output of external libraries (e.g. Stanford
|
||||||
|
# tools, Enju, LDA, Ruby-FANN, Schiphol).
|
||||||
|
Treat.core.verbosity.silence = false
|
||||||
|
|
||||||
|
# A boolean value indicating whether to explain
|
||||||
|
# the steps that Treat is performing.
|
||||||
|
Treat.core.verbosity.debug = true
|
||||||
|
|
||||||
|
# A boolean value indicating whether Treat should
|
||||||
|
# try to detect the language of newly input text.
|
||||||
|
Treat.core.language.detect = false
|
||||||
|
|
||||||
|
# A string representing the language to default
|
||||||
|
# to when detection is off.
|
||||||
|
Treat.core.language.default = 'english'
|
||||||
|
|
||||||
|
# A symbol representing the finest level at which
|
||||||
|
# language detection should be performed if language
|
||||||
|
# detection is turned on.
|
||||||
|
Treat.core.language.detect_at = :document
|
||||||
|
|
||||||
|
# The directory containing executables and JAR files.
|
||||||
|
Treat.paths.bin = '##_INSTALLER_BIN_PATH_##'
|
||||||
|
|
||||||
|
# The directory containing trained models
|
||||||
|
Treat.paths.models = '##_INSTALLER_MODELS_PATH_##'
|
||||||
|
|
||||||
|
# Mongo database configuration.
|
||||||
|
Treat.databases.mongo.db = 'your_database'
|
||||||
|
Treat.databases.mongo.host = 'localhost'
|
||||||
|
Treat.databases.mongo.port = '27017'
|
||||||
|
|
||||||
|
# Include the DSL by default.
|
||||||
|
include Treat::Core::DSL
|
57
Gemfile
57
Gemfile
|
@ -1,48 +1,45 @@
|
||||||
source :rubygems
|
source 'https://rubygems.org'
|
||||||
|
|
||||||
gemspec
|
gemspec
|
||||||
|
|
||||||
gem 'birch'
|
gem 'birch'
|
||||||
gem 'schiphol'
|
gem 'schiphol'
|
||||||
gem 'sourcify'
|
gem 'yomu'
|
||||||
|
gem 'ruby-readability'
|
||||||
|
gem 'nokogiri'
|
||||||
|
|
||||||
group :test do
|
group :test do
|
||||||
gem 'rspec', '2.9.0'
|
gem 'rspec'
|
||||||
gem 'rake'
|
gem 'rake'
|
||||||
gem 'terminal-table'
|
gem 'terminal-table'
|
||||||
gem 'simplecov'
|
gem 'simplecov'
|
||||||
end
|
end
|
||||||
|
|
||||||
=begin
|
=begin
|
||||||
gem 'nokogiri'
|
|
||||||
gem 'psych'
|
|
||||||
gem 'mongoid'
|
|
||||||
gem 'mongo'
|
|
||||||
gem 'bson_ext'
|
|
||||||
|
|
||||||
gem 'zip'
|
|
||||||
gem 'ferret'
|
|
||||||
gem 'lda-ruby'
|
|
||||||
gem 'stanford-core-nlp'
|
|
||||||
gem 'linguistics'
|
gem 'linguistics'
|
||||||
gem 'ruby-readability'
|
gem 'engtagger'
|
||||||
gem 'whatlanguage'
|
gem 'open-nlp'
|
||||||
gem 'chronic'
|
gem 'stanford-core-nlp'
|
||||||
gem 'nickel'
|
gem 'rwordnet'
|
||||||
|
gem 'scalpel'
|
||||||
|
gem 'fastimage'
|
||||||
gem 'decisiontree'
|
gem 'decisiontree'
|
||||||
gem 'rb-libsvm'
|
gem 'whatlanguage'
|
||||||
gem 'ai4r'
|
gem 'zip'
|
||||||
|
gem 'nickel'
|
||||||
|
gem 'tactful_tokenizer'
|
||||||
|
gem 'srx-english'
|
||||||
|
gem 'punkt-segmenter'
|
||||||
|
gem 'chronic'
|
||||||
|
gem 'uea-stemmer'
|
||||||
gem 'rbtagger'
|
gem 'rbtagger'
|
||||||
gem 'ruby-stemmer'
|
gem 'ruby-stemmer'
|
||||||
gem 'punkt-segmenter'
|
|
||||||
gem 'tactful_tokenizer'
|
|
||||||
gem 'nickel'
|
|
||||||
gem 'rwordnet'
|
|
||||||
gem 'uea-stemmer'
|
|
||||||
gem 'engtagger'
|
|
||||||
gem 'activesupport'
|
gem 'activesupport'
|
||||||
gem 'srx-english'
|
gem 'rb-libsvm'
|
||||||
gem 'scalpel'
|
gem 'tomz-liblinear-ruby-swig'
|
||||||
=end
|
gem 'ruby-fann'
|
||||||
|
gem 'fuzzy-string-match'
|
||||||
# english?
|
gem 'levenshtein-ffi'
|
||||||
|
gem 'tf-idf-similarity'
|
||||||
|
gem 'kronic'
|
||||||
|
=end
|
4
LICENSE
4
LICENSE
|
@ -1,4 +1,4 @@
|
||||||
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.2
|
Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 2.0.0
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -15,7 +15,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
|
Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.
|
||||||
|
|
||||||
Non-trivial amount of code has been incorporated and modified from other libraries:
|
A non-trivial amount of code has been incorporated and modified from other libraries:
|
||||||
|
|
||||||
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
||||||
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
||||||
|
|
44
README.md
44
README.md
|
@ -1,35 +1,43 @@
|
||||||
[](http://travis-ci.org/#!/louismullie/treat)
|
[](http://travis-ci.org/#!/louismullie/treat)
|
||||||
[](https://gemnasium.com/louismullie/treat)
|
[](https://codeclimate.com/github/louismullie/treat)
|
||||||
[](https://codeclimate.com/github/louismullie/treat)
|
|
||||||
|
|
||||||
Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm- agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition.
|

|
||||||
|
|
||||||
**Current features**
|
**New in v2.0.5: [OpenNLP integration](https://github.com/louismullie/treat/commit/727a307af0c64747619531c3aa355535edbf4632) and [Yomu support](https://github.com/louismullie/treat/commit/e483b764e4847e48b39e91a77af8a8baa1a1d056)**
|
||||||
|
|
||||||
|
Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm- agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition. Learn more by taking a [quick tour](https://github.com/louismullie/treat/wiki/Quick-Tour) or by reading the [manual](https://github.com/louismullie/treat/wiki/Manual).
|
||||||
|
|
||||||
|
**Features**
|
||||||
|
|
||||||
* Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
|
* Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
|
||||||
* Text retrieval with indexation and full-text search (Ferret).
|
|
||||||
* Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
|
* Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
|
||||||
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
|
||||||
* Lexical resources (WordNet interface, several POS taggers for English).
|
* Lexical resources (WordNet interface, several POS taggers for English).
|
||||||
* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
|
* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
|
||||||
|
* Word inflectors, including stemmers, conjugators, declensors, and number inflection.
|
||||||
* Serialization of annotated entities to YAML, XML or to MongoDB.
|
* Serialization of annotated entities to YAML, XML or to MongoDB.
|
||||||
* Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
|
* Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
|
||||||
* Linguistic resources, including language detection and tag alignments for several treebanks.
|
* Linguistic resources, including language detection and tag alignments for several treebanks.
|
||||||
* Machine learning (decision tree, multilayer perceptron, linear, support vector machines).
|
* Machine learning (decision tree, multilayer perceptron, LIBLINEAR, LIBSVM).
|
||||||
|
* Text retrieval with indexation and full-text search (Ferret).
|
||||||
|
|
||||||
<br>
|
**Contributing**
|
||||||
|
|
||||||
**Resources**
|
I am actively seeking developers that can help maintain and expand this project. You can find a list of ideas for contributing to the project [here](https://github.com/louismullie/treat/wiki/Contributing).
|
||||||
|
|
||||||
* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
|
**Authors**
|
||||||
* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
|
|
||||||
* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
|
Lead developer: @louismullie [[Twitter](https://twitter.com/LouisMullie)]
|
||||||
* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
|
|
||||||
* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
|
Contributors:
|
||||||
* Open an [issue](https://github.com/louismullie/treat/issues).
|
|
||||||
|
- @bdigital
|
||||||
<br>
|
- @automatedtendencies
|
||||||
|
- @LeFnord
|
||||||
|
- @darkphantum
|
||||||
|
- @whistlerbrk
|
||||||
|
- @smileart
|
||||||
|
- @erol
|
||||||
|
|
||||||
**License**
|
**License**
|
||||||
|
|
||||||
This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
|
This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
|
||||||
|
|
7
RELEASE
7
RELEASE
|
@ -48,4 +48,9 @@ Treat - Text Retrieval, Extraction and Annotation Toolkit
|
||||||
* Added LIBSVM and LIBLINEAR classifier support.
|
* Added LIBSVM and LIBLINEAR classifier support.
|
||||||
* Added support for serialization of documents and data sets to MongoDB.
|
* Added support for serialization of documents and data sets to MongoDB.
|
||||||
* Added specs for most of the core classes.
|
* Added specs for most of the core classes.
|
||||||
* Several bug fixes.
|
* Several bug fixes.
|
||||||
|
|
||||||
|
2.0.0rc1
|
||||||
|
|
||||||
|
* MAJOR CHANGE: the old DSL is no longer supported. A new DSL style using
|
||||||
|
lowercase keywords is now used and must be required explicitly.
|
16
Rakefile
16
Rakefile
|
@ -40,20 +40,8 @@ namespace :treat do
|
||||||
task :spec, [:language] do |t, args|
|
task :spec, [:language] do |t, args|
|
||||||
require_relative 'spec/helper'
|
require_relative 'spec/helper'
|
||||||
Treat::Specs::Helper.start_coverage
|
Treat::Specs::Helper.start_coverage
|
||||||
Treat::Specs::Helper.run_core_specs
|
Treat::Specs::Helper.run_library_specs
|
||||||
Treat::Specs::Helper.run_examples_as(
|
Treat::Specs::Helper.run_language_specs(args.language)
|
||||||
'spec', args.language)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Runs worker benchmarks for all languages (by
|
|
||||||
# default), or for a specific language (if supplied).
|
|
||||||
# Also outputs an HTML table
|
|
||||||
# Syntax: rake treat:benchmark (all languages)
|
|
||||||
# - OR - rake treat:benchmark[language]
|
|
||||||
task :benchmark, [:language] do |t, args|
|
|
||||||
require_relative 'spec/helper'
|
|
||||||
Treat::Specs::Helper.run_examples_as(
|
|
||||||
'benchmark', args.language)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -18,5 +18,6 @@ module Treat
|
||||||
require_relative 'treat/exception'
|
require_relative 'treat/exception'
|
||||||
require_relative 'treat/autoload'
|
require_relative 'treat/autoload'
|
||||||
require_relative 'treat/modules'
|
require_relative 'treat/modules'
|
||||||
|
require_relative 'treat/builder'
|
||||||
|
|
||||||
end
|
end
|
|
@ -14,21 +14,31 @@ module Treat::Autoload
|
||||||
# Loads all the files for the base
|
# Loads all the files for the base
|
||||||
# module in the appropriate order.
|
# module in the appropriate order.
|
||||||
def self.included(base)
|
def self.included(base)
|
||||||
# Get the parts of module name.
|
m = self.get_module_name(base)
|
||||||
bits = base.to_s.split('::')
|
d = self.get_module_path(m)
|
||||||
# Singularize the module name.
|
n = self.singularize(m) + '.rb'
|
||||||
w = bits[-1].downcase
|
f, p = File.join(d, n), "#{d}/*.rb"
|
||||||
n = (w[-3..-1] == 'ies' ?
|
require f if File.readable?(f)
|
||||||
(w[0..-4] + 'y') : (w[-1] ==
|
Dir.glob(p).each { |f| require f }
|
||||||
's' ? w[0...-1] : w)) + '.rb'
|
end
|
||||||
# Get the module's directory.
|
|
||||||
d = File.dirname(File.
|
# Returns the path to a module's dir.
|
||||||
expand_path(__FILE__))[0..-6] +
|
def self.get_module_path(name)
|
||||||
bits.join('/').downcase + '/'
|
file = File.expand_path(__FILE__)
|
||||||
# Require base class if exists.
|
dirs = File.dirname(file).split('/')
|
||||||
require d + n if File.readable?(d + n)
|
File.join(*dirs[0..-1], name)
|
||||||
# Require all other files in dir.
|
end
|
||||||
Dir.glob("#{d}*.rb").each { |f| require f }
|
|
||||||
|
# Return the downcased form of the
|
||||||
|
# module's last name (e.g. "entities").
|
||||||
|
def self.get_module_name(mod)
|
||||||
|
mod.to_s.split('::')[-1].downcase
|
||||||
|
end
|
||||||
|
|
||||||
|
# Helper method to singularize words.
|
||||||
|
def self.singularize(w)
|
||||||
|
if w[-3..-1] == 'ies'; w[0..-4] + 'y'
|
||||||
|
else; (w[-1] == 's' ? w[0..-2] : w); end
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -0,0 +1,6 @@
|
||||||
|
class Treat::Builder
|
||||||
|
include Treat::Core::DSL
|
||||||
|
def initialize(&block)
|
||||||
|
instance_exec(&block)
|
||||||
|
end
|
||||||
|
end
|
|
@ -3,71 +3,36 @@
|
||||||
# the /config folder.
|
# the /config folder.
|
||||||
module Treat::Config
|
module Treat::Config
|
||||||
|
|
||||||
|
# Require the importable mix-in.
|
||||||
|
require_relative 'importable'
|
||||||
|
|
||||||
|
# Make all configuration importable.
|
||||||
|
extend Treat::Config::Importable
|
||||||
|
|
||||||
|
# Core configuration options for entities.
|
||||||
class Treat::Config::Entities; end
|
class Treat::Config::Entities; end
|
||||||
|
|
||||||
|
# Configuration for paths to models, binaries,
|
||||||
|
# temporary storage and file downloads.
|
||||||
class Treat::Config::Paths; end
|
class Treat::Config::Paths; end
|
||||||
|
|
||||||
|
# Configuration for all Treat workers.
|
||||||
class Treat::Config::Workers; end
|
class Treat::Config::Workers; end
|
||||||
|
|
||||||
|
# Helpful linguistic options.
|
||||||
class Treat::Config::Linguistics; end
|
class Treat::Config::Linguistics; end
|
||||||
|
|
||||||
|
# Supported workers for each language.
|
||||||
class Treat::Config::Languages; end
|
class Treat::Config::Languages; end
|
||||||
|
|
||||||
|
# Configuration options for external libraries.
|
||||||
class Treat::Config::Libraries; end
|
class Treat::Config::Libraries; end
|
||||||
|
|
||||||
class Treat::Config::Workers; end
|
|
||||||
|
|
||||||
|
# Configuration options for database
|
||||||
|
# connectivity (host, port, etc.)
|
||||||
class Treat::Config::Databases; end
|
class Treat::Config::Databases; end
|
||||||
|
|
||||||
|
# Configuration options for Treat core.
|
||||||
class Treat::Config::Core; end
|
class Treat::Config::Core; end
|
||||||
|
|
||||||
# Require autolodable mix in.
|
|
||||||
require_relative 'configurable'
|
|
||||||
|
|
||||||
# Store all the configuration in self.config
|
|
||||||
class << self; attr_accessor :config; end
|
|
||||||
|
|
||||||
# Setup a proxy on the main Treat module to
|
|
||||||
# make configuration options directly accessible,
|
|
||||||
# using e.g. Treat.paths.tmp = '...'
|
|
||||||
Treat.module_eval do
|
|
||||||
# Handle all missing methods as conf options.
|
|
||||||
# Instead, should dynamically define them. FIXME.
|
|
||||||
def self.method_missing(sym, *args, &block)
|
|
||||||
super(sym, *args, &block) if sym == :to_ary
|
|
||||||
Treat::Config.config[sym]
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Main function; loads all configuration options.
|
|
||||||
def self.configure!
|
|
||||||
config = {}
|
|
||||||
Treat::Config.constants.each do |const|
|
|
||||||
unless const == :Configurable
|
|
||||||
klass = Treat::Config.const_get(const)
|
|
||||||
klass.class_eval do
|
|
||||||
extend Treat::Config::Configurable
|
|
||||||
end
|
|
||||||
k = const.to_s.downcase.intern
|
|
||||||
klass.configure!
|
|
||||||
config[k] = klass.config
|
|
||||||
end
|
|
||||||
end
|
|
||||||
self.config = self.hash_to_struct(config)
|
|
||||||
end
|
|
||||||
|
|
||||||
# * Helper methods * #
|
|
||||||
|
|
||||||
# Convert a hash to nested structs.
|
|
||||||
def self.hash_to_struct(hash)
|
|
||||||
return hash if hash.keys.
|
|
||||||
select { |k| !k.is_a?(Symbol) }.size > 0
|
|
||||||
struct = Struct.new(*hash.keys).new(*hash.values)
|
|
||||||
hash.each do |key, value|
|
|
||||||
if value.is_a?(Hash)
|
|
||||||
struct[key] = self.hash_to_struct(value)
|
|
||||||
end
|
|
||||||
end; return struct
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
|
@ -1,10 +1,29 @@
|
||||||
|
# Provide default functionality to load configuration
|
||||||
|
# options from flat files into their respective modules.
|
||||||
module Treat::Config::Configurable
|
module Treat::Config::Configurable
|
||||||
|
|
||||||
|
# When extended, add the .config property to
|
||||||
|
# the class that is being operated on.
|
||||||
def self.extended(base)
|
def self.extended(base)
|
||||||
class << base; attr_accessor :config; end
|
class << base; attr_accessor :config; end
|
||||||
base.class_eval { self.config = {} }
|
base.class_eval { self.config = {} }
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Provide base functionality to configure
|
||||||
|
# all modules. The behaviour is as follows:
|
||||||
|
#
|
||||||
|
# 1 - Check if a file named data/$CLASS$.rb
|
||||||
|
# exists; if so, load that file as the base
|
||||||
|
# configuration, i.e. "Treat.$CLASS$"; e.g.
|
||||||
|
# "Treat.core"
|
||||||
|
#
|
||||||
|
# 2 - Check if a folder named data/$CLASS$
|
||||||
|
# exists; if so, load each file in that folder
|
||||||
|
# as a suboption of the main configuration,
|
||||||
|
# i.e. "Treat.$CLASS$.$FILE$"; e.g. "Treat.workers"
|
||||||
|
#
|
||||||
|
# (where $CLASS$ is the lowercase name of
|
||||||
|
# the concrete class being extended by this.)
|
||||||
def configure!
|
def configure!
|
||||||
path = File.dirname(File.expand_path( # FIXME
|
path = File.dirname(File.expand_path( # FIXME
|
||||||
__FILE__)).split('/')[0..-4].join('/') + '/'
|
__FILE__)).split('/')[0..-4].join('/') + '/'
|
||||||
|
@ -14,15 +33,19 @@ module Treat::Config::Configurable
|
||||||
base_file = main_dir + mod_name + '.rb'
|
base_file = main_dir + mod_name + '.rb'
|
||||||
if File.readable?(base_file)
|
if File.readable?(base_file)
|
||||||
self.config = eval(File.read(base_file))
|
self.config = eval(File.read(base_file))
|
||||||
end
|
elsif FileTest.directory?(conf_dir)
|
||||||
if FileTest.directory?(conf_dir)
|
self.config = self.from_dir(conf_dir)
|
||||||
config = {}
|
else; raise Treat::Exception,
|
||||||
Dir[conf_dir + '/*'].each do |path|
|
"No config file found for #{mod_name}."
|
||||||
name = File.basename(path, '.*').intern
|
|
||||||
config[name] = eval(File.read(path))
|
|
||||||
end
|
|
||||||
self.config = config
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# * Helper methods for configuration * #
|
||||||
|
def from_dir(conf_dir)
|
||||||
|
Hash[Dir[conf_dir + '/*'].map do |path|
|
||||||
|
name = File.basename(path, '.*').intern
|
||||||
|
[name, eval(File.read(path))]
|
||||||
|
end]
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,50 +0,0 @@
|
||||||
{acronyms:
|
|
||||||
['xml', 'html', 'txt', 'odt',
|
|
||||||
'abw', 'doc', 'yaml', 'uea',
|
|
||||||
'lda', 'pdf', 'ptb', 'dot',
|
|
||||||
'ai', 'id3', 'svo', 'mlp',
|
|
||||||
'svm', 'srx'],
|
|
||||||
|
|
||||||
encodings:
|
|
||||||
{language_to_code: {
|
|
||||||
arabic: 'UTF-8',
|
|
||||||
chinese: 'GB18030',
|
|
||||||
english: 'UTF-8',
|
|
||||||
french: 'ISO_8859-1',
|
|
||||||
ferman: 'ISO_8859-1',
|
|
||||||
hebrew: 'UTF-8'
|
|
||||||
}},
|
|
||||||
|
|
||||||
entities:
|
|
||||||
{list:
|
|
||||||
[:entity, :unknown, :email,
|
|
||||||
:url, :symbol, :sentence,
|
|
||||||
:punctuation, :number,
|
|
||||||
:enclitic, :word, :token,
|
|
||||||
:fragment, :phrase, :paragraph,
|
|
||||||
:title, :zone, :list, :block,
|
|
||||||
:page, :section, :collection,
|
|
||||||
:document],
|
|
||||||
order:
|
|
||||||
[:token, :fragment, :phrase,
|
|
||||||
:sentence, :zone, :section,
|
|
||||||
:document, :collection]},
|
|
||||||
language: {
|
|
||||||
default: :english,
|
|
||||||
detect: false,
|
|
||||||
detect_at: :document
|
|
||||||
},
|
|
||||||
paths: {
|
|
||||||
description: {
|
|
||||||
tmp: 'temporary files',
|
|
||||||
lib: 'class and module definitions',
|
|
||||||
bin: 'binary files',
|
|
||||||
files: 'user-saved files',
|
|
||||||
models: 'model files',
|
|
||||||
spec: 'spec test files'
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
syntax: { sweetened: false },
|
|
||||||
|
|
||||||
verbosity: { debug: false, silence: true}}
|
|
|
@ -4,7 +4,7 @@
|
||||||
'abw', 'doc', 'yaml', 'uea',
|
'abw', 'doc', 'yaml', 'uea',
|
||||||
'lda', 'pdf', 'ptb', 'dot',
|
'lda', 'pdf', 'ptb', 'dot',
|
||||||
'ai', 'id3', 'svo', 'mlp',
|
'ai', 'id3', 'svo', 'mlp',
|
||||||
'svm', 'srx'],
|
'svm', 'srx', 'nlp'],
|
||||||
|
|
||||||
encodings:
|
encodings:
|
||||||
{language_to_code: {
|
{language_to_code: {
|
||||||
|
@ -21,13 +21,13 @@
|
||||||
[:entity, :unknown, :email,
|
[:entity, :unknown, :email,
|
||||||
:url, :symbol, :sentence,
|
:url, :symbol, :sentence,
|
||||||
:punctuation, :number,
|
:punctuation, :number,
|
||||||
:enclitic, :word, :token,
|
:enclitic, :word, :token, :group,
|
||||||
:fragment, :phrase, :paragraph,
|
:fragment, :phrase, :paragraph,
|
||||||
:title, :zone, :list, :block,
|
:title, :zone, :list, :block,
|
||||||
:page, :section, :collection,
|
:page, :section, :collection,
|
||||||
:document],
|
:document],
|
||||||
order:
|
order:
|
||||||
[:token, :fragment, :phrase,
|
[:token, :fragment, :group,
|
||||||
:sentence, :zone, :section,
|
:sentence, :zone, :section,
|
||||||
:document, :collection]},
|
:document, :collection]},
|
||||||
language: {
|
language: {
|
||||||
|
@ -45,7 +45,9 @@
|
||||||
spec: 'spec test files'
|
spec: 'spec test files'
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
learning: {
|
||||||
|
list: [:data_set, :export, :feature, :tag, :problem, :question]
|
||||||
|
},
|
||||||
syntax: { sweetened: false },
|
syntax: { sweetened: false },
|
||||||
|
|
||||||
verbosity: { debug: false, silence: true}
|
verbosity: { debug: false, silence: true}
|
||||||
|
|
|
@ -1,21 +1,12 @@
|
||||||
{
|
{
|
||||||
dependencies: [
|
dependencies: [
|
||||||
'psych',
|
'ferret', 'bson_ext', 'mongo', 'lda-ruby',
|
||||||
'nokogiri',
|
'stanford-core-nlp', 'linguistics',
|
||||||
'ferret',
|
'ruby-readability', 'whatlanguage',
|
||||||
'bson_ext',
|
'chronic', 'kronic', 'nickel', 'decisiontree',
|
||||||
'mongo',
|
'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
|
||||||
'lda-ruby',
|
'tf-idf-similarity', 'narray', 'fastimage',
|
||||||
'stanford-core-nlp',
|
'fuzzy-string-match', 'levenshtein-ffi'
|
||||||
'linguistics',
|
|
||||||
'ruby-readability',
|
|
||||||
'whatlanguage',
|
|
||||||
'chronic',
|
|
||||||
'nickel',
|
|
||||||
'decisiontree',
|
|
||||||
'rb-libsvm',
|
|
||||||
'ai4r',
|
|
||||||
'zip'
|
|
||||||
],
|
],
|
||||||
workers: {
|
workers: {
|
||||||
learners: {
|
learners: {
|
||||||
|
@ -25,7 +16,9 @@
|
||||||
keywords: [:tf_idf],
|
keywords: [:tf_idf],
|
||||||
language: [:what_language],
|
language: [:what_language],
|
||||||
topic_words: [:lda],
|
topic_words: [:lda],
|
||||||
tf_idf: [:native]
|
tf_idf: [:native],
|
||||||
|
distance: [:levenshtein],
|
||||||
|
similarity: [:jaro_winkler, :tf_idf]
|
||||||
},
|
},
|
||||||
formatters: {
|
formatters: {
|
||||||
serializers: [:xml, :yaml, :mongo],
|
serializers: [:xml, :yaml, :mongo],
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
],
|
],
|
||||||
workers: {
|
workers: {
|
||||||
extractors: {
|
extractors: {
|
||||||
time: [:chronic, :ruby, :nickel],
|
time: [:chronic, :kronic, :ruby, :nickel],
|
||||||
topics: [:reuters],
|
topics: [:reuters],
|
||||||
name_tag: [:stanford]
|
name_tag: [:stanford]
|
||||||
},
|
},
|
||||||
|
@ -32,28 +32,64 @@
|
||||||
},
|
},
|
||||||
processors: {
|
processors: {
|
||||||
parsers: [:stanford],
|
parsers: [:stanford],
|
||||||
segmenters: [:srx, :tactful, :punkt, :stanford, :scalpel],
|
segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
|
||||||
tokenizers: [:ptb, :stanford, :punkt]
|
tokenizers: [:ptb, :stanford, :punkt, :open_nlp]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
stop_words:
|
stop_words:
|
||||||
['the', 'of', 'and', 'a', 'to', 'in', 'is',
|
[
|
||||||
'you', 'that', 'it', 'he', 'was', 'for', 'on',
|
"about",
|
||||||
'are', 'as', 'with', 'his', 'they', 'I', 'at',
|
"also",
|
||||||
'be', 'this', 'have', 'from', 'or', 'one', 'had',
|
"are",
|
||||||
'by', 'word', 'but', 'not', 'what', 'all', 'were',
|
"away",
|
||||||
'we', 'when', 'your', 'can', 'said', 'there', 'use',
|
"because",
|
||||||
'an', 'each', 'which', 'she', 'do', 'how', 'their',
|
"been",
|
||||||
'if', 'will', 'up', 'other', 'about', 'out', 'many',
|
"beside",
|
||||||
'then', 'them', 'these', 'so', 'some', 'her', 'would',
|
"besides",
|
||||||
'make', 'like', 'him', 'into', 'time', 'has', 'look',
|
"between",
|
||||||
'two', 'more', 'write', 'go', 'see', 'number', 'no',
|
"but",
|
||||||
'way', 'could', 'people', 'my', 'than', 'first', 'been',
|
"cannot",
|
||||||
'call', 'who', 'its', 'now', 'find', 'long', 'down',
|
"could",
|
||||||
'day', 'did', 'get', 'come', 'made', 'may', 'part',
|
"did",
|
||||||
'say', 'also', 'new', 'much', 'should', 'still',
|
"etc",
|
||||||
'such', 'before', 'after', 'other', 'then', 'over',
|
"even",
|
||||||
'under', 'therefore', 'nonetheless', 'thereafter',
|
"ever",
|
||||||
'afterwards', 'here', 'huh', 'hah', "n't", "'t", 'here',
|
"every",
|
||||||
'neither', 'towards']
|
"for",
|
||||||
|
"had",
|
||||||
|
"have",
|
||||||
|
"how",
|
||||||
|
"into",
|
||||||
|
"isn",
|
||||||
|
"maybe",
|
||||||
|
"non",
|
||||||
|
"nor",
|
||||||
|
"now",
|
||||||
|
"should",
|
||||||
|
"such",
|
||||||
|
"than",
|
||||||
|
"that",
|
||||||
|
"then",
|
||||||
|
"these",
|
||||||
|
"this",
|
||||||
|
"those",
|
||||||
|
"though",
|
||||||
|
"too",
|
||||||
|
"was",
|
||||||
|
"wasn",
|
||||||
|
"were",
|
||||||
|
"what",
|
||||||
|
"when",
|
||||||
|
"where",
|
||||||
|
"which",
|
||||||
|
"while",
|
||||||
|
"who",
|
||||||
|
"whom",
|
||||||
|
"whose",
|
||||||
|
"will",
|
||||||
|
"with",
|
||||||
|
"would",
|
||||||
|
"wouldn",
|
||||||
|
"yes"
|
||||||
|
]
|
||||||
}
|
}
|
|
@ -6,13 +6,143 @@
|
||||||
],
|
],
|
||||||
workers: {
|
workers: {
|
||||||
processors: {
|
processors: {
|
||||||
segmenters: [:punkt],
|
segmenters: [:scalpel],
|
||||||
tokenizers: [],
|
tokenizers: [:ptb,:stanford],
|
||||||
parsers: [:stanford]
|
parsers: [:stanford]
|
||||||
},
|
},
|
||||||
lexicalizers: {
|
lexicalizers: {
|
||||||
taggers: [:stanford],
|
taggers: [:stanford],
|
||||||
categorizers: [:from_tag]
|
categorizers: [:from_tag]
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
stop_words:
|
||||||
|
[
|
||||||
|
"ailleurs",
|
||||||
|
"ainsi",
|
||||||
|
"alors",
|
||||||
|
"aucun",
|
||||||
|
"aucune",
|
||||||
|
"auquel",
|
||||||
|
"aurai",
|
||||||
|
"auras",
|
||||||
|
"aurez",
|
||||||
|
"aurons",
|
||||||
|
"auront",
|
||||||
|
"aussi",
|
||||||
|
"autre",
|
||||||
|
"autres",
|
||||||
|
"aux",
|
||||||
|
"auxquelles",
|
||||||
|
"auxquels",
|
||||||
|
"avaient",
|
||||||
|
"avais",
|
||||||
|
"avait",
|
||||||
|
"avec",
|
||||||
|
"avez",
|
||||||
|
"aviez",
|
||||||
|
"avoir",
|
||||||
|
"avons",
|
||||||
|
"celui",
|
||||||
|
"cependant",
|
||||||
|
"certaine",
|
||||||
|
"certaines",
|
||||||
|
"certains",
|
||||||
|
"ces",
|
||||||
|
"cet",
|
||||||
|
"cette",
|
||||||
|
"ceux",
|
||||||
|
"chacun",
|
||||||
|
"chacune",
|
||||||
|
"chaque",
|
||||||
|
"comme",
|
||||||
|
"constamment",
|
||||||
|
"davantage",
|
||||||
|
"depuis",
|
||||||
|
"des",
|
||||||
|
"desquelles",
|
||||||
|
"desquels",
|
||||||
|
"dessous",
|
||||||
|
"dessus",
|
||||||
|
"donc",
|
||||||
|
"dont",
|
||||||
|
"duquel",
|
||||||
|
"egalement",
|
||||||
|
"elles",
|
||||||
|
"encore",
|
||||||
|
"enfin",
|
||||||
|
"ensuite",
|
||||||
|
"etaient",
|
||||||
|
"etais",
|
||||||
|
"etait",
|
||||||
|
"etes",
|
||||||
|
"etiez",
|
||||||
|
"etions",
|
||||||
|
"etre",
|
||||||
|
"eux",
|
||||||
|
"guere",
|
||||||
|
"ici",
|
||||||
|
"ils",
|
||||||
|
"jamais",
|
||||||
|
"jusqu",
|
||||||
|
"laquelle",
|
||||||
|
"legerement",
|
||||||
|
"lequel",
|
||||||
|
"les",
|
||||||
|
"lesquelles",
|
||||||
|
"lesquels",
|
||||||
|
"leur",
|
||||||
|
"leurs",
|
||||||
|
"lors",
|
||||||
|
"lui",
|
||||||
|
"maintenant",
|
||||||
|
"mais",
|
||||||
|
"malgre",
|
||||||
|
"moi",
|
||||||
|
"moins",
|
||||||
|
"notamment",
|
||||||
|
"parce",
|
||||||
|
"plupart",
|
||||||
|
"pourtant",
|
||||||
|
"presentement",
|
||||||
|
"presque",
|
||||||
|
"puis",
|
||||||
|
"puisque",
|
||||||
|
"quand",
|
||||||
|
"quant",
|
||||||
|
"que",
|
||||||
|
"quel",
|
||||||
|
"quelqu",
|
||||||
|
"quelque",
|
||||||
|
"quelques",
|
||||||
|
"qui",
|
||||||
|
"quoi",
|
||||||
|
"quoique",
|
||||||
|
"rien",
|
||||||
|
"selon",
|
||||||
|
"serai",
|
||||||
|
"seras",
|
||||||
|
"serez",
|
||||||
|
"serons",
|
||||||
|
"seront",
|
||||||
|
"soient",
|
||||||
|
"soit",
|
||||||
|
"sommes",
|
||||||
|
"sont",
|
||||||
|
"sous",
|
||||||
|
"suis",
|
||||||
|
"telle",
|
||||||
|
"telles",
|
||||||
|
"tels",
|
||||||
|
"toi",
|
||||||
|
"toujours",
|
||||||
|
"tout",
|
||||||
|
"toutes",
|
||||||
|
"tres",
|
||||||
|
"trop",
|
||||||
|
"une",
|
||||||
|
"vos",
|
||||||
|
"votre",
|
||||||
|
"vous"
|
||||||
|
]
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,3 +1,5 @@
|
||||||
|
#encoding: UTF-8
|
||||||
|
|
||||||
{
|
{
|
||||||
dependencies: [
|
dependencies: [
|
||||||
'punkt-segmenter',
|
'punkt-segmenter',
|
||||||
|
@ -6,13 +8,130 @@
|
||||||
],
|
],
|
||||||
workers: {
|
workers: {
|
||||||
processors: {
|
processors: {
|
||||||
segmenters: [:punkt],
|
segmenters: [:tactful, :punkt, :stanford, :scalpel],
|
||||||
tokenizers: [],
|
tokenizers: [:stanford, :punkt],
|
||||||
parsers: [:stanford]
|
parsers: [:stanford]
|
||||||
},
|
},
|
||||||
lexicalizers: {
|
lexicalizers: {
|
||||||
taggers: [:stanford],
|
taggers: [:stanford],
|
||||||
categorizers: [:from_tag]
|
categorizers: [:from_tag]
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}
|
stop_words:
|
||||||
|
[
|
||||||
|
"alle",
|
||||||
|
"allem",
|
||||||
|
"alles",
|
||||||
|
"andere",
|
||||||
|
"anderem",
|
||||||
|
"anderen",
|
||||||
|
"anderer",
|
||||||
|
"anderes",
|
||||||
|
"auf",
|
||||||
|
"bei",
|
||||||
|
"beim",
|
||||||
|
"bist",
|
||||||
|
"dadurch",
|
||||||
|
"dein",
|
||||||
|
"deine",
|
||||||
|
"deiner",
|
||||||
|
"deines",
|
||||||
|
"deins",
|
||||||
|
"dem",
|
||||||
|
"denen",
|
||||||
|
"der",
|
||||||
|
"deren",
|
||||||
|
"des",
|
||||||
|
"deshalb",
|
||||||
|
"dessen",
|
||||||
|
"diese",
|
||||||
|
"diesem",
|
||||||
|
"diesen",
|
||||||
|
"dieser",
|
||||||
|
"dieses",
|
||||||
|
"ein",
|
||||||
|
"eine",
|
||||||
|
"einem",
|
||||||
|
"einen",
|
||||||
|
"einer",
|
||||||
|
"eines",
|
||||||
|
"euer",
|
||||||
|
"euere",
|
||||||
|
"eueren",
|
||||||
|
"eueres",
|
||||||
|
"für",
|
||||||
|
"haben",
|
||||||
|
"habt",
|
||||||
|
"hatte",
|
||||||
|
"hatten",
|
||||||
|
"hattest",
|
||||||
|
"hattet",
|
||||||
|
"hierzu",
|
||||||
|
"hinter",
|
||||||
|
"ich",
|
||||||
|
"ihr",
|
||||||
|
"ihre",
|
||||||
|
"ihren",
|
||||||
|
"ihrer",
|
||||||
|
"ihres",
|
||||||
|
"indem",
|
||||||
|
"ist",
|
||||||
|
"jede",
|
||||||
|
"jedem",
|
||||||
|
"jeden",
|
||||||
|
"jeder",
|
||||||
|
"jedes",
|
||||||
|
"kann",
|
||||||
|
"kannst",
|
||||||
|
"können",
|
||||||
|
"könnt",
|
||||||
|
"konnte",
|
||||||
|
"konnten",
|
||||||
|
"konntest",
|
||||||
|
"konntet",
|
||||||
|
"mehr",
|
||||||
|
"mein",
|
||||||
|
"meine",
|
||||||
|
"meiner",
|
||||||
|
"meines",
|
||||||
|
"meins",
|
||||||
|
"nach",
|
||||||
|
"neben",
|
||||||
|
"nicht",
|
||||||
|
"nichts",
|
||||||
|
"seid",
|
||||||
|
"sein",
|
||||||
|
"seine",
|
||||||
|
"seiner",
|
||||||
|
"seines",
|
||||||
|
"seins",
|
||||||
|
"sie",
|
||||||
|
"sind",
|
||||||
|
"über",
|
||||||
|
"und",
|
||||||
|
"uns",
|
||||||
|
"unser",
|
||||||
|
"unsere",
|
||||||
|
"unter",
|
||||||
|
"vor",
|
||||||
|
"warst",
|
||||||
|
"weil",
|
||||||
|
"wenn",
|
||||||
|
"werde",
|
||||||
|
"werden",
|
||||||
|
"werdet",
|
||||||
|
"willst",
|
||||||
|
"wir",
|
||||||
|
"wird",
|
||||||
|
"wirst",
|
||||||
|
"wollen",
|
||||||
|
"wollt",
|
||||||
|
"wollte",
|
||||||
|
"wollten",
|
||||||
|
"wolltest",
|
||||||
|
"wolltet",
|
||||||
|
"zum",
|
||||||
|
"zur"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,5 +8,155 @@
|
||||||
segmenters: [:punkt],
|
segmenters: [:punkt],
|
||||||
tokenizers: []
|
tokenizers: []
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}
|
stop_words:
|
||||||
|
[
|
||||||
|
"affinche",
|
||||||
|
"alcun",
|
||||||
|
"alcuna",
|
||||||
|
"alcune",
|
||||||
|
"alcuni",
|
||||||
|
"alcuno",
|
||||||
|
"allora",
|
||||||
|
"altra",
|
||||||
|
"altre",
|
||||||
|
"altri",
|
||||||
|
"altro",
|
||||||
|
"anziche",
|
||||||
|
"certa",
|
||||||
|
"certe",
|
||||||
|
"certi",
|
||||||
|
"certo",
|
||||||
|
"che",
|
||||||
|
"chi",
|
||||||
|
"chiunque",
|
||||||
|
"comunque",
|
||||||
|
"con",
|
||||||
|
"cosa",
|
||||||
|
"cose",
|
||||||
|
"cui",
|
||||||
|
"dagli",
|
||||||
|
"dai",
|
||||||
|
"dall",
|
||||||
|
"dalla",
|
||||||
|
"dalle",
|
||||||
|
"darsi",
|
||||||
|
"degli",
|
||||||
|
"del",
|
||||||
|
"dell",
|
||||||
|
"della",
|
||||||
|
"delle",
|
||||||
|
"dello",
|
||||||
|
"dunque",
|
||||||
|
"egli",
|
||||||
|
"eppure",
|
||||||
|
"esse",
|
||||||
|
"essi",
|
||||||
|
"forse",
|
||||||
|
"gia",
|
||||||
|
"infatti",
|
||||||
|
"inoltre",
|
||||||
|
"invece",
|
||||||
|
"lui",
|
||||||
|
"malgrado",
|
||||||
|
"mediante",
|
||||||
|
"meno",
|
||||||
|
"mentre",
|
||||||
|
"mie",
|
||||||
|
"miei",
|
||||||
|
"mio",
|
||||||
|
"modo",
|
||||||
|
"molta",
|
||||||
|
"molte",
|
||||||
|
"molti",
|
||||||
|
"molto",
|
||||||
|
"negli",
|
||||||
|
"nel",
|
||||||
|
"nella",
|
||||||
|
"nelle",
|
||||||
|
"nessun",
|
||||||
|
"nessuna",
|
||||||
|
"nessuno",
|
||||||
|
"niente",
|
||||||
|
"noi",
|
||||||
|
"nostra",
|
||||||
|
"nostre",
|
||||||
|
"nostri",
|
||||||
|
"nostro",
|
||||||
|
"nulla",
|
||||||
|
"occorre",
|
||||||
|
"ogni",
|
||||||
|
"ognuno",
|
||||||
|
"oltre",
|
||||||
|
"oltretutto",
|
||||||
|
"oppure",
|
||||||
|
"ovunque",
|
||||||
|
"ovvio",
|
||||||
|
"percio",
|
||||||
|
"pertanto",
|
||||||
|
"piu",
|
||||||
|
"piuttosto",
|
||||||
|
"poca",
|
||||||
|
"poco",
|
||||||
|
"poiche",
|
||||||
|
"propri",
|
||||||
|
"proprie",
|
||||||
|
"proprio",
|
||||||
|
"puo",
|
||||||
|
"qua",
|
||||||
|
"qual",
|
||||||
|
"qualche",
|
||||||
|
"qualcuna",
|
||||||
|
"qualcuno",
|
||||||
|
"quale",
|
||||||
|
"quali",
|
||||||
|
"qualunque",
|
||||||
|
"quando",
|
||||||
|
"quant",
|
||||||
|
"quante",
|
||||||
|
"quanti",
|
||||||
|
"quanto",
|
||||||
|
"quantunque",
|
||||||
|
"quegli",
|
||||||
|
"quei",
|
||||||
|
"quest",
|
||||||
|
"questa",
|
||||||
|
"queste",
|
||||||
|
"questi",
|
||||||
|
"questo",
|
||||||
|
"qui",
|
||||||
|
"quindi",
|
||||||
|
"sebbene",
|
||||||
|
"sembra",
|
||||||
|
"sempre",
|
||||||
|
"senza",
|
||||||
|
"soltanto",
|
||||||
|
"stessa",
|
||||||
|
"stesse",
|
||||||
|
"stessi",
|
||||||
|
"stesso",
|
||||||
|
"sugli",
|
||||||
|
"sui",
|
||||||
|
"sul",
|
||||||
|
"sull",
|
||||||
|
"sulla",
|
||||||
|
"sulle",
|
||||||
|
"suo",
|
||||||
|
"suoi",
|
||||||
|
"taluni",
|
||||||
|
"taluno",
|
||||||
|
"tanta",
|
||||||
|
"tanti",
|
||||||
|
"tanto",
|
||||||
|
"tra",
|
||||||
|
"tuo",
|
||||||
|
"tuoi",
|
||||||
|
"tutt",
|
||||||
|
"tutta",
|
||||||
|
"tutte",
|
||||||
|
"tutto",
|
||||||
|
"una",
|
||||||
|
"uno",
|
||||||
|
"voi"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
|
@ -8,5 +8,284 @@
|
||||||
segmenters: [:punkt],
|
segmenters: [:punkt],
|
||||||
tokenizers: []
|
tokenizers: []
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
stop_words:
|
||||||
|
[
|
||||||
|
"abans",
|
||||||
|
"aca",
|
||||||
|
"acerca",
|
||||||
|
"ahora",
|
||||||
|
"aixo",
|
||||||
|
"algo",
|
||||||
|
"algu",
|
||||||
|
"alguien",
|
||||||
|
"algun",
|
||||||
|
"alguna",
|
||||||
|
"algunas",
|
||||||
|
"algunes",
|
||||||
|
"alguno",
|
||||||
|
"algunos",
|
||||||
|
"alguns",
|
||||||
|
"alla",
|
||||||
|
"alli",
|
||||||
|
"allo",
|
||||||
|
"altra",
|
||||||
|
"altre",
|
||||||
|
"altres",
|
||||||
|
"amb",
|
||||||
|
"amunt",
|
||||||
|
"antes",
|
||||||
|
"aquel",
|
||||||
|
"aquell",
|
||||||
|
"aquella",
|
||||||
|
"aquellas",
|
||||||
|
"aquelles",
|
||||||
|
"aquellos",
|
||||||
|
"aquells",
|
||||||
|
"aquest",
|
||||||
|
"aquesta",
|
||||||
|
"aquestes",
|
||||||
|
"aquests",
|
||||||
|
"aqui",
|
||||||
|
"asimismo",
|
||||||
|
"aun",
|
||||||
|
"aunque",
|
||||||
|
"avall",
|
||||||
|
"cada",
|
||||||
|
"casi",
|
||||||
|
"com",
|
||||||
|
"como",
|
||||||
|
"con",
|
||||||
|
"cosas",
|
||||||
|
"coses",
|
||||||
|
"cual",
|
||||||
|
"cuales",
|
||||||
|
"cualquier",
|
||||||
|
"cuando",
|
||||||
|
"damunt",
|
||||||
|
"darrera",
|
||||||
|
"davant",
|
||||||
|
"debe",
|
||||||
|
"deben",
|
||||||
|
"deber",
|
||||||
|
"debia",
|
||||||
|
"debian",
|
||||||
|
"decia",
|
||||||
|
"decian",
|
||||||
|
"decir",
|
||||||
|
"deia",
|
||||||
|
"deien",
|
||||||
|
"del",
|
||||||
|
"demasiado",
|
||||||
|
"des",
|
||||||
|
"desde",
|
||||||
|
"despues",
|
||||||
|
"dicen",
|
||||||
|
"diciendo",
|
||||||
|
"dins",
|
||||||
|
"dir",
|
||||||
|
"diu",
|
||||||
|
"diuen",
|
||||||
|
"doncs",
|
||||||
|
"ell",
|
||||||
|
"ellas",
|
||||||
|
"elles",
|
||||||
|
"ells",
|
||||||
|
"els",
|
||||||
|
"encara",
|
||||||
|
"entonces",
|
||||||
|
"ese",
|
||||||
|
"esos",
|
||||||
|
"esser",
|
||||||
|
"esta",
|
||||||
|
"estan",
|
||||||
|
"estando",
|
||||||
|
"estant",
|
||||||
|
"estar",
|
||||||
|
"estaria",
|
||||||
|
"estarian",
|
||||||
|
"estarien",
|
||||||
|
"estas",
|
||||||
|
"estos",
|
||||||
|
"farien",
|
||||||
|
"feia",
|
||||||
|
"feien",
|
||||||
|
"fent",
|
||||||
|
"fue",
|
||||||
|
"fueron",
|
||||||
|
"gaire",
|
||||||
|
"gairebe",
|
||||||
|
"hace",
|
||||||
|
"hacia",
|
||||||
|
"hacian",
|
||||||
|
"haciendo",
|
||||||
|
"haran",
|
||||||
|
"hauria",
|
||||||
|
"haurien",
|
||||||
|
"hemos",
|
||||||
|
"hola",
|
||||||
|
"junto",
|
||||||
|
"lejos",
|
||||||
|
"les",
|
||||||
|
"lloc",
|
||||||
|
"los",
|
||||||
|
"menos",
|
||||||
|
"menys",
|
||||||
|
"meva",
|
||||||
|
"mias",
|
||||||
|
"mio",
|
||||||
|
"misma",
|
||||||
|
"mismas",
|
||||||
|
"mismo",
|
||||||
|
"mismos",
|
||||||
|
"molt",
|
||||||
|
"molta",
|
||||||
|
"moltes",
|
||||||
|
"mon",
|
||||||
|
"mucha",
|
||||||
|
"mucho",
|
||||||
|
"muy",
|
||||||
|
"nadie",
|
||||||
|
"ningu",
|
||||||
|
"nomes",
|
||||||
|
"nosaltres",
|
||||||
|
"nosotros",
|
||||||
|
"nostra",
|
||||||
|
"nostre",
|
||||||
|
"nuestra",
|
||||||
|
"nuestras",
|
||||||
|
"nuestro",
|
||||||
|
"nuestros",
|
||||||
|
"nunca",
|
||||||
|
"otra",
|
||||||
|
"pasa",
|
||||||
|
"pasan",
|
||||||
|
"pasara",
|
||||||
|
"pasaria",
|
||||||
|
"passara",
|
||||||
|
"passaria",
|
||||||
|
"passen",
|
||||||
|
"perque",
|
||||||
|
"poc",
|
||||||
|
"pocas",
|
||||||
|
"pocos",
|
||||||
|
"podem",
|
||||||
|
"poden",
|
||||||
|
"podeu",
|
||||||
|
"podria",
|
||||||
|
"podrian",
|
||||||
|
"podrien",
|
||||||
|
"poques",
|
||||||
|
"porque",
|
||||||
|
"potser",
|
||||||
|
"puc",
|
||||||
|
"pudieron",
|
||||||
|
"pudo",
|
||||||
|
"puede",
|
||||||
|
"pueden",
|
||||||
|
"puesto",
|
||||||
|
"qualsevol",
|
||||||
|
"quan",
|
||||||
|
"que",
|
||||||
|
"queria",
|
||||||
|
"querian",
|
||||||
|
"qui",
|
||||||
|
"quien",
|
||||||
|
"quienes",
|
||||||
|
"quiere",
|
||||||
|
"quieren",
|
||||||
|
"quin",
|
||||||
|
"quina",
|
||||||
|
"quines",
|
||||||
|
"quins",
|
||||||
|
"quizas",
|
||||||
|
"segueent",
|
||||||
|
"segun",
|
||||||
|
"sempre",
|
||||||
|
"seran",
|
||||||
|
"seria",
|
||||||
|
"serian",
|
||||||
|
"seu",
|
||||||
|
"seva",
|
||||||
|
"sido",
|
||||||
|
"siempre",
|
||||||
|
"siendo",
|
||||||
|
"siguiente",
|
||||||
|
"sino",
|
||||||
|
"sobretodo",
|
||||||
|
"solamente",
|
||||||
|
"sovint",
|
||||||
|
"suya",
|
||||||
|
"suyas",
|
||||||
|
"suyo",
|
||||||
|
"suyos",
|
||||||
|
"tambe",
|
||||||
|
"tambien",
|
||||||
|
"tanmateix",
|
||||||
|
"tanta",
|
||||||
|
"tanto",
|
||||||
|
"tendran",
|
||||||
|
"tendria",
|
||||||
|
"tendrian",
|
||||||
|
"tenen",
|
||||||
|
"teu",
|
||||||
|
"teva",
|
||||||
|
"tiene",
|
||||||
|
"tienen",
|
||||||
|
"tindran",
|
||||||
|
"tindria",
|
||||||
|
"tindrien",
|
||||||
|
"toda",
|
||||||
|
"todavia",
|
||||||
|
"todo",
|
||||||
|
"tota",
|
||||||
|
"totes",
|
||||||
|
"tras",
|
||||||
|
"traves",
|
||||||
|
"tuvieron",
|
||||||
|
"tuvo",
|
||||||
|
"tuya",
|
||||||
|
"tuyas",
|
||||||
|
"tuyo",
|
||||||
|
"tuyos",
|
||||||
|
"unas",
|
||||||
|
"unes",
|
||||||
|
"unos",
|
||||||
|
"uns",
|
||||||
|
"usaba",
|
||||||
|
"usaban",
|
||||||
|
"usada",
|
||||||
|
"usades",
|
||||||
|
"usado",
|
||||||
|
"usan",
|
||||||
|
"usando",
|
||||||
|
"usant",
|
||||||
|
"usar",
|
||||||
|
"usat",
|
||||||
|
"usava",
|
||||||
|
"usaven",
|
||||||
|
"usen",
|
||||||
|
"vaig",
|
||||||
|
"varem",
|
||||||
|
"varen",
|
||||||
|
"vareu",
|
||||||
|
"vegada",
|
||||||
|
"vegades",
|
||||||
|
"vez",
|
||||||
|
"volem",
|
||||||
|
"volen",
|
||||||
|
"voleu",
|
||||||
|
"vora",
|
||||||
|
"vos",
|
||||||
|
"vosaltres",
|
||||||
|
"vosotros",
|
||||||
|
"vostra",
|
||||||
|
"vostre",
|
||||||
|
"voy",
|
||||||
|
"vuestra",
|
||||||
|
"vuestras",
|
||||||
|
"vuestro",
|
||||||
|
"vuestros",
|
||||||
|
"vull"
|
||||||
|
]
|
||||||
}
|
}
|
|
@ -8,5 +8,282 @@
|
||||||
segmenters: [:punkt],
|
segmenters: [:punkt],
|
||||||
tokenizers: []
|
tokenizers: []
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}
|
stop_words:
|
||||||
|
[
|
||||||
|
"atminstone",
|
||||||
|
"an",
|
||||||
|
"anda",
|
||||||
|
"aven",
|
||||||
|
"aldrig",
|
||||||
|
"alla",
|
||||||
|
"alls",
|
||||||
|
"allt",
|
||||||
|
"alltid",
|
||||||
|
"allting",
|
||||||
|
"alltsa",
|
||||||
|
"andra",
|
||||||
|
"annan",
|
||||||
|
"annars",
|
||||||
|
"antingen",
|
||||||
|
"att",
|
||||||
|
"bakom",
|
||||||
|
"bland",
|
||||||
|
"blev",
|
||||||
|
"bli",
|
||||||
|
"bliva",
|
||||||
|
"blivit",
|
||||||
|
"bort",
|
||||||
|
"bortom",
|
||||||
|
"bredvid",
|
||||||
|
"dar",
|
||||||
|
"darav",
|
||||||
|
"darefter",
|
||||||
|
"darfor",
|
||||||
|
"dari",
|
||||||
|
"darigenom",
|
||||||
|
"darvid",
|
||||||
|
"dedar",
|
||||||
|
"definitivt",
|
||||||
|
"del",
|
||||||
|
"den",
|
||||||
|
"dendar",
|
||||||
|
"denhar",
|
||||||
|
"denna",
|
||||||
|
"deras",
|
||||||
|
"dessa",
|
||||||
|
"dessutom",
|
||||||
|
"desto",
|
||||||
|
"det",
|
||||||
|
"detta",
|
||||||
|
"dylik",
|
||||||
|
"efterat",
|
||||||
|
"efter",
|
||||||
|
"eftersom",
|
||||||
|
"eller",
|
||||||
|
"emellertid",
|
||||||
|
"enbart",
|
||||||
|
"endast",
|
||||||
|
"enligt",
|
||||||
|
"ens",
|
||||||
|
"ensam",
|
||||||
|
"envar",
|
||||||
|
"eran",
|
||||||
|
"etc",
|
||||||
|
"ett",
|
||||||
|
"exakt",
|
||||||
|
"fatt",
|
||||||
|
"fastan",
|
||||||
|
"fick",
|
||||||
|
"fler",
|
||||||
|
"flera",
|
||||||
|
"foljande",
|
||||||
|
"foljde",
|
||||||
|
"foljer",
|
||||||
|
"for",
|
||||||
|
"fore",
|
||||||
|
"forhoppningsvis",
|
||||||
|
"formodligen",
|
||||||
|
"forr",
|
||||||
|
"forra",
|
||||||
|
"forutom",
|
||||||
|
"forvisso",
|
||||||
|
"fran",
|
||||||
|
"framfor",
|
||||||
|
"fullstandigt",
|
||||||
|
"gang",
|
||||||
|
"gar",
|
||||||
|
"gatt",
|
||||||
|
"ganska",
|
||||||
|
"gav",
|
||||||
|
"genom",
|
||||||
|
"genomgaende",
|
||||||
|
"ger",
|
||||||
|
"gick",
|
||||||
|
"gjorde",
|
||||||
|
"gjort",
|
||||||
|
"gor",
|
||||||
|
"hade",
|
||||||
|
"har",
|
||||||
|
"harav",
|
||||||
|
"har",
|
||||||
|
"hej",
|
||||||
|
"hela",
|
||||||
|
"helst",
|
||||||
|
"helt",
|
||||||
|
"hitta",
|
||||||
|
"hon",
|
||||||
|
"honom",
|
||||||
|
"hur",
|
||||||
|
"huruvida",
|
||||||
|
"huvudsakligen",
|
||||||
|
"ibland",
|
||||||
|
"icke",
|
||||||
|
"ickedestomindre",
|
||||||
|
"igen",
|
||||||
|
"ihop",
|
||||||
|
"inat",
|
||||||
|
"ingen",
|
||||||
|
"ingenstans",
|
||||||
|
"inget",
|
||||||
|
"innan",
|
||||||
|
"innehalla",
|
||||||
|
"inre",
|
||||||
|
"inte",
|
||||||
|
"inuti",
|
||||||
|
"istaellet",
|
||||||
|
"kanske",
|
||||||
|
"klart",
|
||||||
|
"knappast",
|
||||||
|
"knappt",
|
||||||
|
"kom",
|
||||||
|
"komma",
|
||||||
|
"kommer",
|
||||||
|
"kraver",
|
||||||
|
"kunde",
|
||||||
|
"kunna",
|
||||||
|
"lata",
|
||||||
|
"later",
|
||||||
|
"lagga",
|
||||||
|
"langre",
|
||||||
|
"laet",
|
||||||
|
"lagd",
|
||||||
|
"leta",
|
||||||
|
"letar",
|
||||||
|
"manga",
|
||||||
|
"maste",
|
||||||
|
"med",
|
||||||
|
"medan",
|
||||||
|
"medans",
|
||||||
|
"mellan",
|
||||||
|
"mest",
|
||||||
|
"min",
|
||||||
|
"mindre",
|
||||||
|
"minst",
|
||||||
|
"mittemellan",
|
||||||
|
"motsvarande",
|
||||||
|
"mycket",
|
||||||
|
"nagon",
|
||||||
|
"nagongang",
|
||||||
|
"nagonsin",
|
||||||
|
"nagonstans",
|
||||||
|
"nagonting",
|
||||||
|
"nagorlunda",
|
||||||
|
"nagot",
|
||||||
|
"namligen",
|
||||||
|
"nar",
|
||||||
|
"nara",
|
||||||
|
"nasta",
|
||||||
|
"nastan",
|
||||||
|
"nedat",
|
||||||
|
"nedanfor",
|
||||||
|
"nerat",
|
||||||
|
"ner",
|
||||||
|
"nog",
|
||||||
|
"normalt",
|
||||||
|
"nummer",
|
||||||
|
"nuvarande",
|
||||||
|
"nytt",
|
||||||
|
"oavsett",
|
||||||
|
"och",
|
||||||
|
"ocksa",
|
||||||
|
"oppna",
|
||||||
|
"over",
|
||||||
|
"overallt",
|
||||||
|
"ofta",
|
||||||
|
"okej",
|
||||||
|
"olika",
|
||||||
|
"ovanfor",
|
||||||
|
"ratt",
|
||||||
|
"redan",
|
||||||
|
"relativt",
|
||||||
|
"respektive",
|
||||||
|
"rimlig",
|
||||||
|
"rimligen",
|
||||||
|
"rimligt",
|
||||||
|
"salunda",
|
||||||
|
"savida",
|
||||||
|
"saga",
|
||||||
|
"sager",
|
||||||
|
"sakert",
|
||||||
|
"sand",
|
||||||
|
"sarskilt",
|
||||||
|
"satt",
|
||||||
|
"sak",
|
||||||
|
"samma",
|
||||||
|
"samtliga",
|
||||||
|
"sedd",
|
||||||
|
"senare",
|
||||||
|
"senaste",
|
||||||
|
"ser",
|
||||||
|
"sig",
|
||||||
|
"sista",
|
||||||
|
"sjaelv",
|
||||||
|
"ska",
|
||||||
|
"skall",
|
||||||
|
"skickad",
|
||||||
|
"skriva",
|
||||||
|
"skulle",
|
||||||
|
"snabb",
|
||||||
|
"snarare",
|
||||||
|
"snart",
|
||||||
|
"som",
|
||||||
|
"somliga",
|
||||||
|
"speciellt",
|
||||||
|
"stalla",
|
||||||
|
"stallet",
|
||||||
|
"starta",
|
||||||
|
"strax",
|
||||||
|
"stundom",
|
||||||
|
"tackar",
|
||||||
|
"tanka",
|
||||||
|
"taga",
|
||||||
|
"tagen",
|
||||||
|
"tala",
|
||||||
|
"tanke",
|
||||||
|
"tidigare",
|
||||||
|
"tills",
|
||||||
|
"tog",
|
||||||
|
"totalt",
|
||||||
|
"trolig",
|
||||||
|
"troligen",
|
||||||
|
"tvaers",
|
||||||
|
"tvars",
|
||||||
|
"tycka",
|
||||||
|
"tyckte",
|
||||||
|
"tyvarr",
|
||||||
|
"understundom",
|
||||||
|
"upp",
|
||||||
|
"uppenbarligen",
|
||||||
|
"uppenbart",
|
||||||
|
"utan",
|
||||||
|
"utanfor",
|
||||||
|
"uteslutande",
|
||||||
|
"utom",
|
||||||
|
"var",
|
||||||
|
"varan",
|
||||||
|
"vad",
|
||||||
|
"val",
|
||||||
|
"varde",
|
||||||
|
"vanlig",
|
||||||
|
"vanligen",
|
||||||
|
"var",
|
||||||
|
"vare",
|
||||||
|
"varenda",
|
||||||
|
"varfor",
|
||||||
|
"varifran",
|
||||||
|
"varit",
|
||||||
|
"varje",
|
||||||
|
"varken",
|
||||||
|
"vars",
|
||||||
|
"vart",
|
||||||
|
"vem",
|
||||||
|
"verkligen",
|
||||||
|
"vidare",
|
||||||
|
"vilken",
|
||||||
|
"vill",
|
||||||
|
"visar",
|
||||||
|
"visst",
|
||||||
|
"visste"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
|
@ -8,5 +8,9 @@
|
||||||
stanford: {
|
stanford: {
|
||||||
jar_path: nil,
|
jar_path: nil,
|
||||||
model_path: nil
|
model_path: nil
|
||||||
|
},
|
||||||
|
open_nlp: {
|
||||||
|
jar_path: nil,
|
||||||
|
model_path: nil
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -24,8 +24,9 @@
|
||||||
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
||||||
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
||||||
'Verb phrase', ['', '', 'VP', '', '', ''],
|
'Verb phrase', ['', '', 'VP', '', '', ''],
|
||||||
|
'Inverted yes/no question', ['', '', 'SQ', '', '', ''],
|
||||||
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
||||||
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
'Wh adverb phrase', ['', '', 'WHADVP', '', '', ''],
|
||||||
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
||||||
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
||||||
'Unknown', ['', '', 'X', '', '', ''],
|
'Unknown', ['', '', 'X', '', '', ''],
|
||||||
|
@ -100,7 +101,7 @@
|
||||||
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
||||||
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
||||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
||||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
|
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # FIXME
|
||||||
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
||||||
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
||||||
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
||||||
|
@ -181,7 +182,7 @@
|
||||||
|
|
||||||
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
||||||
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
||||||
'Punctuationm, comma', ['PUN', ',', ',', '$,'],
|
'Punctuation, comma', ['PUN', ',', ',', '$,'],
|
||||||
'Punctuation, dash', ['PUN', '-', '-'],
|
'Punctuation, dash', ['PUN', '-', '-'],
|
||||||
'Punctuation, dollar sign', ['PUN', '', '$'],
|
'Punctuation, dollar sign', ['PUN', '', '$'],
|
||||||
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
||||||
|
@ -324,4 +325,4 @@
|
||||||
['SQ', 'Inverted yes/no question']
|
['SQ', 'Inverted yes/no question']
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,5 +27,13 @@
|
||||||
tf_idf: {
|
tf_idf: {
|
||||||
type: :annotator,
|
type: :annotator,
|
||||||
targets: [:word]
|
targets: [:word]
|
||||||
|
},
|
||||||
|
similarity: {
|
||||||
|
type: :computer,
|
||||||
|
targets: [:entity]
|
||||||
|
},
|
||||||
|
distance: {
|
||||||
|
type: :computer,
|
||||||
|
targets: [:entity]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
{
|
{
|
||||||
taggers: {
|
taggers: {
|
||||||
type: :annotator,
|
type: :annotator,
|
||||||
targets: [:phrase, :token]
|
targets: [:group, :token],
|
||||||
|
recursive: true
|
||||||
},
|
},
|
||||||
categorizers: {
|
categorizers: {
|
||||||
type: :annotator,
|
type: :annotator,
|
||||||
targets: [:phrase, :token],
|
targets: [:group, :token],
|
||||||
recursive: true
|
recursive: true
|
||||||
},
|
},
|
||||||
sensers: {
|
sensers: {
|
||||||
|
@ -14,5 +15,5 @@
|
||||||
preset_option: :nym,
|
preset_option: :nym,
|
||||||
presets: [:synonyms, :antonyms,
|
presets: [:synonyms, :antonyms,
|
||||||
:hyponyms, :hypernyms],
|
:hyponyms, :hypernyms],
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
# Mixin that is extended by Treat::Config
|
||||||
|
# in order to provide a single point of
|
||||||
|
# access method to trigger the import.
|
||||||
|
module Treat::Config::Importable
|
||||||
|
|
||||||
|
# Import relies on each configuration.
|
||||||
|
require_relative 'configurable'
|
||||||
|
|
||||||
|
# Store all the configuration in self.config
|
||||||
|
def self.extended(base)
|
||||||
|
class << base; attr_accessor :config; end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Main function; loads all configuration options.
|
||||||
|
def import!
|
||||||
|
config, c = {}, Treat::Config::Configurable
|
||||||
|
definition = :define_singleton_method
|
||||||
|
Treat::Config.constants.each do |const|
|
||||||
|
next if const.to_s.downcase.is_mixin?
|
||||||
|
klass = Treat::Config.const_get(const)
|
||||||
|
klass.class_eval { extend c }.configure!
|
||||||
|
name = const.to_s.downcase.intern
|
||||||
|
config[name] = klass.config
|
||||||
|
Treat.send(definition, name) do
|
||||||
|
Treat::Config.config[name]
|
||||||
|
end
|
||||||
|
end
|
||||||
|
self.config = config.to_struct
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -1,9 +1,13 @@
|
||||||
|
# Generates the following path config options:
|
||||||
|
# Treat.paths.tmp, Treat.paths.bin, Treat.paths.lib,
|
||||||
|
# Treat.paths.models, Treat.paths.files, Treat.paths.spec.
|
||||||
class Treat::Config::Paths
|
class Treat::Config::Paths
|
||||||
|
|
||||||
# Get the path configuration based on the
|
# Get the path configuration based on the
|
||||||
# directory structure loaded into Paths.
|
# directory structure loaded into Paths.
|
||||||
|
# Note that this doesn't call super, as
|
||||||
|
# there is no external config files to load.
|
||||||
def self.configure!
|
def self.configure!
|
||||||
super
|
|
||||||
root = File.dirname(File.expand_path( # FIXME
|
root = File.dirname(File.expand_path( # FIXME
|
||||||
__FILE__)).split('/')[0..-4].join('/') + '/'
|
__FILE__)).split('/')[0..-4].join('/') + '/'
|
||||||
self.config = Hash[
|
self.config = Hash[
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
# Handles all configuration related
|
||||||
|
# to understanding of part of speech
|
||||||
|
# and phrasal tags.
|
||||||
class Treat::Config::Tags
|
class Treat::Config::Tags
|
||||||
|
|
||||||
# Load and align tags.
|
|
||||||
# Generate a map of word and phrase tags
|
# Generate a map of word and phrase tags
|
||||||
# to their syntactic category, keyed by
|
# to their syntactic category, keyed by
|
||||||
# tag set.
|
# tag set.
|
||||||
|
@ -16,21 +18,20 @@ class Treat::Config::Tags
|
||||||
align_tags(phrase_tags, tag_sets)
|
align_tags(phrase_tags, tag_sets)
|
||||||
self.config[:aligned] = config
|
self.config[:aligned] = config
|
||||||
end
|
end
|
||||||
|
|
||||||
# * Helper methods for tag set config * #
|
# Helper methods for tag set config.
|
||||||
|
# Align tag tags in the tag set
|
||||||
# Align tag tags in the tag set
|
def self.align_tags(tags, tag_sets)
|
||||||
def self.align_tags(tags, tag_sets)
|
wttc = {}
|
||||||
wttc = {}
|
tags.each_slice(2) do |desc, tags|
|
||||||
tags.each_slice(2) do |desc, tags|
|
category = desc.gsub(',', ' ,').
|
||||||
category = desc.gsub(',', ' ,').
|
split(' ')[0].downcase
|
||||||
split(' ')[0].downcase
|
tag_sets.each_with_index do |tag_set, i|
|
||||||
tag_sets.each_with_index do |tag_set, i|
|
next unless tags[i]
|
||||||
next unless tags[i]
|
wttc[tags[i]] ||= {}
|
||||||
wttc[tags[i]] ||= {}
|
wttc[tags[i]][tag_set] = category
|
||||||
wttc[tags[i]][tag_set] = category
|
end
|
||||||
end
|
end; return wttc
|
||||||
end; return wttc
|
end
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
|
@ -1,36 +1,21 @@
|
||||||
module Treat::Core::DSL
|
module Treat::Core::DSL
|
||||||
|
|
||||||
# Include DSL on base.
|
|
||||||
def self.included(base)
|
|
||||||
self.sweeten_entities(base)
|
|
||||||
self.sweeten_learning(base)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Map all classes in Treat::Entities to
|
# Map all classes in Treat::Entities to
|
||||||
# a global builder function (Entity, etc.)
|
# a global builder function (entity, word,
|
||||||
def self.sweeten_entities(base, on = true)
|
# phrase, punctuation, symbol, list, etc.)
|
||||||
Treat.core.entities.list.each do |type|
|
def self.included(base)
|
||||||
next if type == :Symbol
|
def method_missing(sym,*args,&block)
|
||||||
kname = type.cc.intern
|
@@entities ||= Treat.core.entities.list
|
||||||
klass = Treat::Entities.const_get(kname)
|
@@learning ||= Treat.core.learning.list
|
||||||
Object.class_eval do
|
if @@entities.include?(sym)
|
||||||
define_method(kname) do |val, opts={}|
|
klass = Treat::Entities.const_get(sym.cc)
|
||||||
klass.build(val, opts)
|
return klass.build(*args)
|
||||||
end if on
|
elsif @@learning.include?(sym)
|
||||||
remove_method(name) if !on
|
klass = Treat::Learning.const_get(sym.cc)
|
||||||
end
|
return klass.new(*args)
|
||||||
end
|
else
|
||||||
end
|
super(sym,*args,&block)
|
||||||
|
raise "Uncaught method ended up in Treat DSL."
|
||||||
# Map all classes in the Learning module
|
|
||||||
# to a global builder function (e.g. DataSet).
|
|
||||||
def self.sweeten_learning(base, on = true)
|
|
||||||
Treat::Learning.constants.each do |kname|
|
|
||||||
Object.class_eval do
|
|
||||||
define_method(kname) do |*args|
|
|
||||||
Treat::Learning.const_get(kname).new(*args)
|
|
||||||
end if on
|
|
||||||
remove_method(name) if !on
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
# A dependency manager for Treat language plugins.
|
# A dependency manager for Treat language plugins.
|
||||||
# Usage: Treat::Installer.install('language')
|
# Usage: Treat::Installer.install('language')
|
||||||
module Treat::Core::Installer
|
module Treat::Core::Installer
|
||||||
|
|
||||||
require 'schiphol'
|
require 'schiphol'
|
||||||
|
|
||||||
# Address of the server with the files.
|
# Address of the server with the files.
|
||||||
Server = 'www.louismullie.com'
|
Server = 's3.amazonaws.com/static-public-assets'
|
||||||
|
|
||||||
# Filenames for the Stanford packages.
|
# Filenames for the Stanford packages.
|
||||||
StanfordPackages = {
|
StanfordPackages = {
|
||||||
|
@ -20,34 +20,34 @@ module Treat::Core::Installer
|
||||||
:bin => File.absolute_path(Treat.paths.bin),
|
:bin => File.absolute_path(Treat.paths.bin),
|
||||||
:models => File.absolute_path(Treat.paths.models)
|
:models => File.absolute_path(Treat.paths.models)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Install required dependencies and optional
|
# Install required dependencies and optional
|
||||||
# dependencies for a specific language.
|
# dependencies for a specific language.
|
||||||
def self.install(language = 'english')
|
def self.install(language = 'english')
|
||||||
|
|
||||||
# Require the Rubygem dependency installer.
|
# Require the Rubygem dependency installer.
|
||||||
silence_warnings do
|
silence_warnings do
|
||||||
require 'rubygems/dependency_installer'
|
require 'rubygems/dependency_installer'
|
||||||
end
|
end
|
||||||
|
|
||||||
@@installer = Gem::DependencyInstaller.new
|
@@installer = Gem::DependencyInstaller.new
|
||||||
|
|
||||||
if language == 'travis'
|
if language == 'travis'
|
||||||
install_travis; return
|
install_travis; return
|
||||||
end
|
end
|
||||||
|
|
||||||
l = "#{language.to_s.capitalize} language"
|
l = "#{language.to_s.capitalize} language"
|
||||||
|
|
||||||
puts "\nTreat Installer, v. #{Treat::VERSION.to_s}\n\n"
|
puts "\nTreat Installer, v. #{Treat::VERSION.to_s}\n\n"
|
||||||
|
|
||||||
begin
|
begin
|
||||||
|
|
||||||
title "Installing core dependencies."
|
title "Installing core dependencies."
|
||||||
install_language_dependencies('agnostic')
|
install_language_dependencies('agnostic')
|
||||||
|
|
||||||
title "Installing dependencies for the #{l}.\n"
|
title "Installing dependencies for the #{l}.\n"
|
||||||
install_language_dependencies(language)
|
install_language_dependencies(language)
|
||||||
|
|
||||||
# Download models only if the gem is installed.
|
# Download models only if the gem is installed.
|
||||||
begin
|
begin
|
||||||
Gem::Specification.find_by_name('punkt-segmenter')
|
Gem::Specification.find_by_name('punkt-segmenter')
|
||||||
|
@ -73,7 +73,7 @@ module Treat::Core::Installer
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Minimal install for Travis CI.
|
# Minimal install for Travis CI.
|
||||||
def self.install_travis
|
def self.install_travis
|
||||||
install_language_dependencies(:agnostic)
|
install_language_dependencies(:agnostic)
|
||||||
|
@ -81,7 +81,7 @@ module Treat::Core::Installer
|
||||||
download_stanford(:minimal)
|
download_stanford(:minimal)
|
||||||
download_punkt_models(:english)
|
download_punkt_models(:english)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
def self.install_language_dependencies(language)
|
def self.install_language_dependencies(language)
|
||||||
dependencies = Treat.languages[language].dependencies
|
dependencies = Treat.languages[language].dependencies
|
||||||
|
@ -92,31 +92,31 @@ module Treat::Core::Installer
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.download_stanford(package = :minimal)
|
def self.download_stanford(package = :minimal)
|
||||||
|
|
||||||
f = StanfordPackages[package]
|
f = StanfordPackages[package]
|
||||||
url = "http://#{Server}/treat/#{f}"
|
url = "http://#{Server}/treat/#{f}"
|
||||||
loc = Schiphol.download(url,
|
loc = Schiphol.download(url,
|
||||||
download_folder: Treat.paths.tmp
|
download_folder: Treat.paths.tmp
|
||||||
)
|
)
|
||||||
puts "- Unzipping package ..."
|
puts "- Unzipping package ..."
|
||||||
dest = File.join(Treat.paths.tmp, 'stanford')
|
dest = File.join(Treat.paths.tmp, 'stanford')
|
||||||
unzip_stanford(loc, dest)
|
unzip_stanford(loc, dest)
|
||||||
|
|
||||||
model_dir = File.join(Paths[:models], 'stanford')
|
model_dir = File.join(Paths[:models], 'stanford')
|
||||||
bin_dir = File.join(Paths[:bin], 'stanford')
|
bin_dir = File.join(Paths[:bin], 'stanford')
|
||||||
origin = File.join(Paths[:tmp], 'stanford')
|
origin = File.join(Paths[:tmp], 'stanford')
|
||||||
|
|
||||||
# Mac hidden files fix.
|
# Mac hidden files fix.
|
||||||
mac_remove = File.join(dest, '__MACOSX')
|
mac_remove = File.join(dest, '__MACOSX')
|
||||||
if File.readable?(mac_remove)
|
if File.readable?(mac_remove)
|
||||||
FileUtils.rm_rf(mac_remove)
|
FileUtils.rm_rf(mac_remove)
|
||||||
end
|
end
|
||||||
|
|
||||||
unless File.readable?(bin_dir)
|
unless File.readable?(bin_dir)
|
||||||
puts "- Creating directory bin/stanford ..."
|
puts "- Creating directory bin/stanford ..."
|
||||||
FileUtils.mkdir_p(bin_dir)
|
FileUtils.mkdir_p(bin_dir)
|
||||||
end
|
end
|
||||||
|
|
||||||
unless File.readable?(model_dir)
|
unless File.readable?(model_dir)
|
||||||
puts "- Creating directory models/stanford ..."
|
puts "- Creating directory models/stanford ..."
|
||||||
FileUtils.mkdir_p(model_dir)
|
FileUtils.mkdir_p(model_dir)
|
||||||
|
@ -127,18 +127,18 @@ module Treat::Core::Installer
|
||||||
Dir.glob(File.join(origin, '*')) do |f|
|
Dir.glob(File.join(origin, '*')) do |f|
|
||||||
next if ['.', '..'].include?(f)
|
next if ['.', '..'].include?(f)
|
||||||
if f.index('jar')
|
if f.index('jar')
|
||||||
FileUtils.cp(f, File.join(Paths[:bin],
|
FileUtils.cp(f, File.join(Paths[:bin],
|
||||||
'stanford', File.basename(f)))
|
'stanford', File.basename(f)))
|
||||||
elsif FileTest.directory?(f)
|
elsif FileTest.directory?(f)
|
||||||
FileUtils.cp_r(f, model_dir)
|
FileUtils.cp_r(f, model_dir)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
puts "- Cleaning up..."
|
puts "- Cleaning up..."
|
||||||
FileUtils.rm_rf(origin)
|
FileUtils.rm_rf(origin)
|
||||||
|
|
||||||
'Done.'
|
'Done.'
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.download_punkt_models(language)
|
def self.download_punkt_models(language)
|
||||||
|
@ -146,7 +146,7 @@ module Treat::Core::Installer
|
||||||
f = "#{language}.yaml"
|
f = "#{language}.yaml"
|
||||||
dest = "#{Treat.paths.models}punkt/"
|
dest = "#{Treat.paths.models}punkt/"
|
||||||
url = "http://#{Server}/treat/punkt/#{f}"
|
url = "http://#{Server}/treat/punkt/#{f}"
|
||||||
loc = Schiphol.download(url,
|
loc = Schiphol.download(url,
|
||||||
download_folder: Treat.paths.tmp
|
download_folder: Treat.paths.tmp
|
||||||
)
|
)
|
||||||
unless File.readable?(dest)
|
unless File.readable?(dest)
|
||||||
|
@ -156,7 +156,7 @@ module Treat::Core::Installer
|
||||||
|
|
||||||
puts "- Copying model file to models/punkt ..."
|
puts "- Copying model file to models/punkt ..."
|
||||||
FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
|
FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
|
||||||
|
|
||||||
puts "- Cleaning up..."
|
puts "- Cleaning up..."
|
||||||
FileUtils.rm_rf(Paths[:tmp] + Server)
|
FileUtils.rm_rf(Paths[:tmp] + Server)
|
||||||
|
|
||||||
|
@ -181,12 +181,11 @@ module Treat::Core::Installer
|
||||||
begin
|
begin
|
||||||
puts "Installing #{dependency}...\n"
|
puts "Installing #{dependency}...\n"
|
||||||
@@installer.install(dependency)
|
@@installer.install(dependency)
|
||||||
rescue Exception => error
|
rescue Gem::InstallError => error
|
||||||
raise
|
puts "Warning: couldn't install " +
|
||||||
puts "Couldn't install gem '#{dependency}' " +
|
"gem '#{dependency}' (#{error.message})."
|
||||||
"(#{error.message})."
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Unzip a file to the destination path.
|
# Unzip a file to the destination path.
|
||||||
|
@ -194,7 +193,7 @@ module Treat::Core::Installer
|
||||||
|
|
||||||
require 'zip/zip'
|
require 'zip/zip'
|
||||||
f_path = ''
|
f_path = ''
|
||||||
|
|
||||||
Zip::ZipFile.open(file) do |zip_file|
|
Zip::ZipFile.open(file) do |zip_file|
|
||||||
zip_file.each do |f|
|
zip_file.each do |f|
|
||||||
f_path = File.join(destination, f.name)
|
f_path = File.join(destination, f.name)
|
||||||
|
|
|
@ -3,6 +3,7 @@ class Treat::Core::Server
|
||||||
# Refer to http://rack.rubyforge.org/doc/classes/Rack/Server.html
|
# Refer to http://rack.rubyforge.org/doc/classes/Rack/Server.html
|
||||||
# for possible options to configure.
|
# for possible options to configure.
|
||||||
def initialize(handler = 'thin', options = {})
|
def initialize(handler = 'thin', options = {})
|
||||||
|
raise "Implementation not finished."
|
||||||
require 'json'; require 'rack'
|
require 'json'; require 'rack'
|
||||||
@handler, @options = handler.capitalize, options
|
@handler, @options = handler.capitalize, options
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,6 +4,7 @@ module Treat::Entities
|
||||||
|
|
||||||
# Represents a collection.
|
# Represents a collection.
|
||||||
class Collection < Entity; end
|
class Collection < Entity; end
|
||||||
|
|
||||||
# Represents a document.
|
# Represents a document.
|
||||||
class Document < Entity; end
|
class Document < Entity; end
|
||||||
|
|
||||||
|
@ -18,6 +19,9 @@ module Treat::Entities
|
||||||
# Represents a block of text
|
# Represents a block of text
|
||||||
class Block < Section; end
|
class Block < Section; end
|
||||||
|
|
||||||
|
# Represents a list.
|
||||||
|
class List < Section; end
|
||||||
|
|
||||||
# * Zones and related classes * #
|
# * Zones and related classes * #
|
||||||
|
|
||||||
# Represents a zone of text.
|
# Represents a zone of text.
|
||||||
|
@ -31,9 +35,6 @@ module Treat::Entities
|
||||||
# of sentences and/or phrases).
|
# of sentences and/or phrases).
|
||||||
class Paragraph < Zone; end
|
class Paragraph < Zone; end
|
||||||
|
|
||||||
# Represents a list.
|
|
||||||
class List < Zone; end
|
|
||||||
|
|
||||||
# * Groups and related classes * #
|
# * Groups and related classes * #
|
||||||
|
|
||||||
# Represents a group of tokens.
|
# Represents a group of tokens.
|
||||||
|
|
|
@ -22,7 +22,9 @@ module Treat::Entities
|
||||||
attr_accessor :type
|
attr_accessor :type
|
||||||
|
|
||||||
# Autoload all the classes in /abilities.
|
# Autoload all the classes in /abilities.
|
||||||
include Treat::Autoload
|
path = File.expand_path(__FILE__)
|
||||||
|
patt = File.dirname(path) + '/entity/*.rb'
|
||||||
|
Dir.glob(patt).each { |f| require f }
|
||||||
|
|
||||||
# Implements support for #register, #registry.
|
# Implements support for #register, #registry.
|
||||||
include Registrable
|
include Registrable
|
||||||
|
@ -82,8 +84,11 @@ module Treat::Entities
|
||||||
#
|
#
|
||||||
# Takes in a single entity or an array of
|
# Takes in a single entity or an array of
|
||||||
# entities. Returns the first child supplied.
|
# entities. Returns the first child supplied.
|
||||||
# @see Treat::Registrable
|
# If a string or numeric is given, it is converted to an entity first.
|
||||||
def <<(entities, clear_parent = true)
|
def <<(entities, clear_parent = true)
|
||||||
|
entities = (entities.is_a?(::String) ||
|
||||||
|
entities.is_a?(::Numeric)) ?
|
||||||
|
entities.to_entity : entities
|
||||||
entities = entities.is_a?(::Array) ?
|
entities = entities.is_a?(::Array) ?
|
||||||
entities : [entities]
|
entities : [entities]
|
||||||
# Register each entity in this node.
|
# Register each entity in this node.
|
||||||
|
@ -121,7 +126,7 @@ module Treat::Entities
|
||||||
# requested method does not exist. Also
|
# requested method does not exist. Also
|
||||||
# provides suggestions for misspellings.
|
# provides suggestions for misspellings.
|
||||||
def invalid_call(sym)
|
def invalid_call(sym)
|
||||||
msg = Treat::Workers::Category.lookup(sym) ?
|
msg = Treat::Workers.lookup(sym) ?
|
||||||
"Method #{sym} can't be called on a #{type}." :
|
"Method #{sym} can't be called on a #{type}." :
|
||||||
"Method #{sym} is not defined by Treat." +
|
"Method #{sym} is not defined by Treat." +
|
||||||
Treat::Helpers::Help.did_you_mean?(
|
Treat::Helpers::Help.did_you_mean?(
|
||||||
|
|
|
@ -57,7 +57,7 @@ module Treat::Entities::Entity::Applicable
|
||||||
|
|
||||||
# Get the group of a task.
|
# Get the group of a task.
|
||||||
def get_group(task)
|
def get_group(task)
|
||||||
g = Treat::Workers::Category.lookup(task)
|
g = Treat::Workers.lookup(task)
|
||||||
unless g
|
unless g
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Task #{task} does not exist."
|
"Task #{task} does not exist."
|
||||||
|
|
|
@ -15,7 +15,21 @@ module Treat::Entities::Entity::Buildable
|
||||||
PunctRegexp = /^[[:punct:]\$]+$/
|
PunctRegexp = /^[[:punct:]\$]+$/
|
||||||
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
||||||
EmailRegexp = /.+\@.+\..+/
|
EmailRegexp = /.+\@.+\..+/
|
||||||
Enclitics = %w['ll 'm 're 's 't 've]
|
Enclitics = [
|
||||||
|
# EXAMPLE:
|
||||||
|
"'d", # I'd => I would
|
||||||
|
"'ll", # I'll => I will
|
||||||
|
"'m", # I'm => I am
|
||||||
|
"'re", # We're => We are
|
||||||
|
"'s", # There's => There is
|
||||||
|
# Let's => Let us
|
||||||
|
"'t", # 'Twas => Archaic ('Twas the night)
|
||||||
|
"'ve", # They've => They have
|
||||||
|
"n't" # Can't => Can not
|
||||||
|
]
|
||||||
|
|
||||||
|
# Accepted formats of serialized files
|
||||||
|
AcceptedFormats = ['.xml', '.yml', '.yaml', '.mongo']
|
||||||
|
|
||||||
# Reserved folder names
|
# Reserved folder names
|
||||||
Reserved = ['.index']
|
Reserved = ['.index']
|
||||||
|
@ -23,23 +37,38 @@ module Treat::Entities::Entity::Buildable
|
||||||
# Build an entity from anything (can be
|
# Build an entity from anything (can be
|
||||||
# a string, numeric, folder, or file name
|
# a string, numeric, folder, or file name
|
||||||
# representing a raw or serialized file).
|
# representing a raw or serialized file).
|
||||||
def build(file_or_value, options = {})
|
def build(*args)
|
||||||
|
|
||||||
|
# This probably needs some doc.
|
||||||
|
if args.size == 0
|
||||||
|
file_or_value = ''
|
||||||
|
elsif args[0].is_a?(Hash)
|
||||||
|
file_or_value = args[0]
|
||||||
|
elsif args.size == 1
|
||||||
|
if args[0].is_a?(Treat::Entities::Entity)
|
||||||
|
args[0] = [args[0]]
|
||||||
|
end
|
||||||
|
file_or_value = args[0]
|
||||||
|
else
|
||||||
|
file_or_value = args
|
||||||
|
end
|
||||||
|
|
||||||
fv = file_or_value.to_s
|
fv = file_or_value.to_s
|
||||||
|
|
||||||
if file_or_value.is_a?(Hash)
|
if fv == ''; self.new
|
||||||
|
elsif file_or_value.is_a?(Array)
|
||||||
|
from_array(file_or_value)
|
||||||
|
elsif file_or_value.is_a?(Hash)
|
||||||
from_db(file_or_value)
|
from_db(file_or_value)
|
||||||
elsif self == Treat::Entities::Document ||
|
elsif self == Treat::Entities::Document || (is_serialized_file?(fv))
|
||||||
(fv.index('yml') || fv.index('yaml') ||
|
|
||||||
fv.index('xml') || fv.index('mongo'))
|
|
||||||
if fv =~ UriRegexp
|
if fv =~ UriRegexp
|
||||||
from_url(fv, options)
|
from_url(fv)
|
||||||
else
|
else
|
||||||
from_file(fv, options)
|
from_file(fv)
|
||||||
end
|
end
|
||||||
elsif self == Treat::Entities::Collection
|
elsif self == Treat::Entities::Collection
|
||||||
if FileTest.directory?(fv)
|
if FileTest.directory?(fv)
|
||||||
from_folder(fv, options)
|
from_folder(fv)
|
||||||
else
|
else
|
||||||
create_collection(fv)
|
create_collection(fv)
|
||||||
end
|
end
|
||||||
|
@ -78,8 +107,19 @@ module Treat::Entities::Entity::Buildable
|
||||||
e
|
e
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Build a document from an array
|
||||||
|
# of builders.
|
||||||
|
def from_array(array)
|
||||||
|
obj = self.new
|
||||||
|
array.each do |el|
|
||||||
|
el = el.to_entity unless el.is_a?(Treat::Entities::Entity)
|
||||||
|
obj << el
|
||||||
|
end
|
||||||
|
obj
|
||||||
|
end
|
||||||
|
|
||||||
# Build a document from a URL.
|
# Build a document from a URL.
|
||||||
def from_url(url, options)
|
def from_url(url)
|
||||||
unless self ==
|
unless self ==
|
||||||
Treat::Entities::Document
|
Treat::Entities::Document
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
|
@ -88,8 +128,12 @@ module Treat::Entities::Entity::Buildable
|
||||||
end
|
end
|
||||||
|
|
||||||
begin
|
begin
|
||||||
|
folder = Treat.paths.files
|
||||||
|
if folder[-1] == '/'
|
||||||
|
folder = folder[0..-2]
|
||||||
|
end
|
||||||
f = Schiphol.download(url,
|
f = Schiphol.download(url,
|
||||||
download_folder: Treat.paths.files,
|
download_folder: folder,
|
||||||
show_progress: !Treat.core.verbosity.silence,
|
show_progress: !Treat.core.verbosity.silence,
|
||||||
rectify_extensions: true,
|
rectify_extensions: true,
|
||||||
max_tries: 3)
|
max_tries: 3)
|
||||||
|
@ -97,10 +141,8 @@ module Treat::Entities::Entity::Buildable
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Couldn't download file at #{url}."
|
"Couldn't download file at #{url}."
|
||||||
end
|
end
|
||||||
|
|
||||||
options[:default_to] ||= 'html'
|
|
||||||
|
|
||||||
e = from_file(f, options)
|
e = from_file(f,'html')
|
||||||
e.set :url, url.to_s
|
e.set :url, url.to_s
|
||||||
e
|
e
|
||||||
|
|
||||||
|
@ -123,7 +165,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
# Build an entity from a folder with documents.
|
# Build an entity from a folder with documents.
|
||||||
# Folders will be searched recursively.
|
# Folders will be searched recursively.
|
||||||
def from_folder(folder, options)
|
def from_folder(folder)
|
||||||
|
|
||||||
return if Reserved.include?(folder)
|
return if Reserved.include?(folder)
|
||||||
|
|
||||||
|
@ -148,49 +190,44 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
c = Treat::Entities::Collection.new(folder)
|
c = Treat::Entities::Collection.new(folder)
|
||||||
folder += '/' unless folder[-1] == '/'
|
folder += '/' unless folder[-1] == '/'
|
||||||
|
|
||||||
if !FileTest.directory?(folder)
|
if !FileTest.directory?(folder)
|
||||||
FileUtils.mkdir(folder)
|
FileUtils.mkdir(folder)
|
||||||
end
|
end
|
||||||
|
|
||||||
c.set :folder, folder
|
c.set :folder, folder
|
||||||
i = folder + '/.index'
|
i = folder + '/.index'
|
||||||
c.set :index, i if FileTest.directory?(i)
|
c.set :index, i if FileTest.directory?(i)
|
||||||
|
|
||||||
Dir[folder + '*'].each do |f|
|
Dir[folder + '*'].each do |f|
|
||||||
if FileTest.directory?(f)
|
if FileTest.directory?(f)
|
||||||
c2 = Treat::Entities::Collection.
|
c2 = Treat::Entities::Collection.
|
||||||
from_folder(f, options)
|
from_folder(f)
|
||||||
c.<<(c2, false) if c2
|
c.<<(c2, false) if c2
|
||||||
else
|
else
|
||||||
c.<<(Treat::Entities::Document.
|
c.<<(Treat::Entities::Document.
|
||||||
from_file(f, options), false)
|
from_file(f), false)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
c
|
return c
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a document from a raw or serialized file.
|
# Build a document from a raw or serialized file.
|
||||||
def from_file(file, options)
|
def from_file(file,def_fmt=nil)
|
||||||
|
|
||||||
if file.index('yml') ||
|
if is_serialized_file?(file)
|
||||||
file.index('yaml') ||
|
from_serialized_file(file)
|
||||||
file.index('xml') ||
|
|
||||||
file.index('mongo')
|
|
||||||
from_serialized_file(file, options)
|
|
||||||
else
|
else
|
||||||
fmt = Treat::Workers::Formatters::Readers::Autoselect.
|
fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
|
||||||
detect_format(file, options[:default_to])
|
from_raw_file(file, fmt)
|
||||||
options[:_format] = fmt
|
|
||||||
from_raw_file(file, options)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a document from a raw file.
|
# Build a document from a raw file.
|
||||||
def from_raw_file(file, options)
|
def from_raw_file(file, def_fmt='txt')
|
||||||
|
|
||||||
unless self ==
|
unless self ==
|
||||||
Treat::Entities::Document
|
Treat::Entities::Document
|
||||||
|
@ -204,7 +241,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
"Path '#{file}' does not "+
|
"Path '#{file}' does not "+
|
||||||
"point to a readable file."
|
"point to a readable file."
|
||||||
end
|
end
|
||||||
|
options = {default_format: def_fmt}
|
||||||
d = Treat::Entities::Document.new
|
d = Treat::Entities::Document.new
|
||||||
d.set :file, file
|
d.set :file, file
|
||||||
d.read(:autoselect, options)
|
d.read(:autoselect, options)
|
||||||
|
@ -212,34 +249,32 @@ module Treat::Entities::Entity::Buildable
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build an entity from a serialized file.
|
# Build an entity from a serialized file.
|
||||||
def from_serialized_file(file, options)
|
def from_serialized_file(file)
|
||||||
|
|
||||||
if file.index('mongo')
|
unless File.readable?(file)
|
||||||
options[:id] = file.scan( # Consolidate this
|
raise Treat::Exception,
|
||||||
/([0-9]+)\.mongo/).first.first
|
"Path '#{file}' does not "+
|
||||||
from_db(:mongo, options)
|
"point to a readable file."
|
||||||
else
|
|
||||||
unless File.readable?(file)
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Path '#{file}' does not "+
|
|
||||||
"point to a readable file."
|
|
||||||
end
|
|
||||||
doc = Treat::Entities::Document.new
|
|
||||||
doc.set :file, file
|
|
||||||
format = nil
|
|
||||||
if file.index('yml') || file.index('yaml')
|
|
||||||
format = :yaml
|
|
||||||
elsif file.index('xml')
|
|
||||||
f = :xml
|
|
||||||
else
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Unreadable serialized format for #{file}."
|
|
||||||
end
|
|
||||||
doc.unserialize(format, options)
|
|
||||||
doc.children[0].set_as_root! # Fix this
|
|
||||||
doc.children[0]
|
|
||||||
end
|
end
|
||||||
|
doc = Treat::Entities::Document.new
|
||||||
|
doc.set :file, file
|
||||||
|
format = nil
|
||||||
|
if File.extname(file) == '.yml' ||
|
||||||
|
File.extname(file) == '.yaml'
|
||||||
|
format = :yaml
|
||||||
|
elsif File.extname(file) == '.xml'
|
||||||
|
format = :xml
|
||||||
|
else
|
||||||
|
raise Treat::Exception,
|
||||||
|
"Unreadable serialized format for #{file}."
|
||||||
|
end
|
||||||
|
doc.unserialize(format)
|
||||||
|
doc.children[0].set_as_root! # Fix this
|
||||||
|
doc.children[0]
|
||||||
|
end
|
||||||
|
|
||||||
|
def is_serialized_file?(path_to_check)
|
||||||
|
(AcceptedFormats.include? File.extname(path_to_check)) && (File.file?(path_to_check))
|
||||||
end
|
end
|
||||||
|
|
||||||
def from_db(hash)
|
def from_db(hash)
|
||||||
|
@ -258,9 +293,23 @@ module Treat::Entities::Entity::Buildable
|
||||||
# Build any kind of entity from a string.
|
# Build any kind of entity from a string.
|
||||||
def anything_from_string(string)
|
def anything_from_string(string)
|
||||||
case self.mn.downcase.intern
|
case self.mn.downcase.intern
|
||||||
when :document, :collection
|
when :document
|
||||||
|
folder = Treat.paths.files
|
||||||
|
if folder[-1] == '/'
|
||||||
|
folder = folder[0..-2]
|
||||||
|
end
|
||||||
|
|
||||||
|
now = Time.now.to_f
|
||||||
|
doc_file = folder+ "/#{now}.txt"
|
||||||
|
string.force_encoding('UTF-8')
|
||||||
|
File.open(doc_file, 'w') do |f|
|
||||||
|
f.puts string
|
||||||
|
end
|
||||||
|
|
||||||
|
from_raw_file(doc_file)
|
||||||
|
when :collection
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Cannot create a document or " +
|
"Cannot create a " +
|
||||||
"collection from a string " +
|
"collection from a string " +
|
||||||
"(need a readable file/folder)."
|
"(need a readable file/folder)."
|
||||||
when :phrase
|
when :phrase
|
||||||
|
@ -287,6 +336,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# This should be improved on.
|
||||||
def check_encoding(string)
|
def check_encoding(string)
|
||||||
string.encode("UTF-8", undef: :replace) # Fix
|
string.encode("UTF-8", undef: :replace) # Fix
|
||||||
end
|
end
|
||||||
|
@ -346,7 +396,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def create_collection(fv)
|
def create_collection(fv)
|
||||||
FileUtils.mkdir(fv)
|
FileUtils.mkdir(fv)
|
||||||
Treat::Entities::Collection.new(fv)
|
Treat::Entities::Collection.new(fv)
|
||||||
|
|
|
@ -11,8 +11,8 @@ module Treat::Entities::Entity::Checkable
|
||||||
return @features[feature] if has?(feature)
|
return @features[feature] if has?(feature)
|
||||||
return send(feature) if do_it
|
return send(feature) if do_it
|
||||||
task = caller_method(2) # This is dangerous !
|
task = caller_method(2) # This is dangerous !
|
||||||
g1 = Treat::Workers::Category.lookup(task)
|
g1 = Treat::Workers.lookup(task)
|
||||||
g2 = Treat::Workers::Category.lookup(feature)
|
g2 = Treat::Workers.lookup(feature)
|
||||||
|
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"#{g1.type.to_s.capitalize} " +
|
"#{g1.type.to_s.capitalize} " +
|
||||||
|
|
|
@ -41,6 +41,7 @@ module Treat::Entities::Entity::Countable
|
||||||
# Returns the frequency of the given value
|
# Returns the frequency of the given value
|
||||||
# in this entity.
|
# in this entity.
|
||||||
def frequency_of(value)
|
def frequency_of(value)
|
||||||
|
value = value.downcase
|
||||||
if is_a?(Treat::Entities::Token)
|
if is_a?(Treat::Entities::Token)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Cannot get the frequency " +
|
"Cannot get the frequency " +
|
||||||
|
|
|
@ -3,67 +3,64 @@
|
||||||
# printed by the #print_debug function.
|
# printed by the #print_debug function.
|
||||||
module Treat::Entities::Entity::Debuggable
|
module Treat::Entities::Entity::Debuggable
|
||||||
|
|
||||||
@@prev = nil
|
# Previous state and counter.
|
||||||
@@i = 0
|
@@prev, @@i = nil, 0
|
||||||
|
|
||||||
# Explains what Treat is currently doing.
|
# Explains what Treat is currently doing.
|
||||||
|
# Fixme: last call will never get shown.
|
||||||
def print_debug(entity, task, worker, group, options)
|
def print_debug(entity, task, worker, group, options)
|
||||||
|
# Get a list of the worker's targets.
|
||||||
targs = group.targets.map do |target|
|
targets = group.targets.map(&:to_s)
|
||||||
target.to_s
|
|
||||||
end
|
|
||||||
|
|
||||||
if targs.size == 1
|
# List the worker's targets as either
|
||||||
t = targs[0]
|
# a single target or an and/or form
|
||||||
else
|
# (since it would be too costly to
|
||||||
t = targs[0..-2].join(', ') +
|
# actually determine what target types
|
||||||
' and/or ' + targs[-1]
|
# were processed at runtime for each call).
|
||||||
end
|
t = targets.size == 1 ? targets[0] : targets[
|
||||||
|
0..-2].join(', ') + ' and/or ' + targets[-1]
|
||||||
|
|
||||||
|
# Add genitive for annotations (sing./plural)
|
||||||
|
genitive = targets.size > 1 ? 'their' : 'its'
|
||||||
|
|
||||||
|
# Set up an empty string and humanize task name.
|
||||||
|
doing, human_task = '', task.to_s.gsub('_', ' ')
|
||||||
|
|
||||||
genitive = targs.size > 1 ?
|
# Base is "{task}-ed {a(n)|N} {target(s)}"
|
||||||
'their' : 'its'
|
if [:transformer, :computer].include?(group.type)
|
||||||
|
|
||||||
doing = ''
|
|
||||||
|
|
||||||
human_task = task.to_s.gsub('_', ' ')
|
|
||||||
|
|
||||||
if group.type == :transformer ||
|
|
||||||
group.type == :computer
|
|
||||||
|
|
||||||
tt = human_task
|
tt = human_task
|
||||||
tt = tt[0..-2] if tt[-1] == 'e'
|
tt = tt[0..-2] if tt[-1] == 'e'
|
||||||
ed = tt[-1] == 'd' ? '' : 'ed'
|
ed = tt[-1] == 'd' ? '' : 'ed'
|
||||||
doing = "#{tt.capitalize}#{ed} #{t}"
|
doing = "#{tt.capitalize}#{ed} #{t}"
|
||||||
|
# Base is "Annotated {a(n)|N} {target(s)}"
|
||||||
elsif group.type == :annotator
|
elsif group.type == :annotator
|
||||||
|
|
||||||
if group.preset_option
|
if group.preset_option
|
||||||
opt = options[group.preset_option]
|
opt = options[group.preset_option]
|
||||||
form = opt.to_s.gsub('_', ' ')
|
form = opt.to_s.gsub('_', ' ')
|
||||||
human_task[-1] = ''
|
human_task[-1] = ''
|
||||||
human_task = form + ' ' + human_task
|
human_task = form + ' ' + human_task
|
||||||
end
|
end
|
||||||
|
|
||||||
doing = "Annotated #{t} with " +
|
doing = "Annotated #{t} with " +
|
||||||
"#{genitive} #{human_task}"
|
"#{genitive} #{human_task}"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Form is '{base} in format {worker}'.
|
||||||
if group.to_s.index('Formatters')
|
if group.to_s.index('Formatters')
|
||||||
curr = doing +
|
curr = doing + ' in format ' + worker.to_s
|
||||||
' in format ' +
|
# Form is '{base} using {worker}'.
|
||||||
worker.to_s
|
|
||||||
else
|
else
|
||||||
curr = doing +
|
curr = doing + ' using ' + worker.to_s.gsub('_', ' ')
|
||||||
' using ' +
|
|
||||||
worker.to_s.gsub('_', ' ')
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Remove any double pluralization that may happen.
|
||||||
curr.gsub!('ss', 's') unless curr.index('class')
|
curr.gsub!('ss', 's') unless curr.index('class')
|
||||||
curr += '.'
|
|
||||||
|
|
||||||
if curr == @@prev
|
# Accumulate repeated tasks.
|
||||||
@@i += 1
|
@@i += 1 if curr == @@prev
|
||||||
else
|
|
||||||
|
# Change tasks, so output.
|
||||||
|
if curr != @@prev && @@prev
|
||||||
|
# Pluralize entity names if necessary.
|
||||||
if @@i > 1
|
if @@i > 1
|
||||||
Treat.core.entities.list.each do |e|
|
Treat.core.entities.list.each do |e|
|
||||||
@@prev.gsub!(e.to_s, e.to_s + 's')
|
@@prev.gsub!(e.to_s, e.to_s + 's')
|
||||||
|
@ -71,9 +68,15 @@ module Treat::Entities::Entity::Debuggable
|
||||||
@@prev.gsub!('its', 'their')
|
@@prev.gsub!('its', 'their')
|
||||||
@@prev = @@prev.split(' ').
|
@@prev = @@prev.split(' ').
|
||||||
insert(1, @@i.to_s).join(' ')
|
insert(1, @@i.to_s).join(' ')
|
||||||
|
# Add determiner if singular.
|
||||||
|
else
|
||||||
|
@@prev = @@prev.split(' ').
|
||||||
|
insert(1, 'a').join(' ')
|
||||||
end
|
end
|
||||||
|
# Reset counter.
|
||||||
@@i = 0
|
@@i = 0
|
||||||
puts @@prev # Last call doesn't get shown.
|
# Write to stdout.
|
||||||
|
puts @@prev + '.'
|
||||||
end
|
end
|
||||||
|
|
||||||
@@prev = curr
|
@@prev = curr
|
||||||
|
|
|
@ -88,7 +88,6 @@ module Treat::Entities::Entity::Delegatable
|
||||||
# Get the default worker for that language
|
# Get the default worker for that language
|
||||||
# inside the given group.
|
# inside the given group.
|
||||||
def find_worker_for_language(language, group)
|
def find_worker_for_language(language, group)
|
||||||
|
|
||||||
lang = Treat.languages[language]
|
lang = Treat.languages[language]
|
||||||
cat = group.to_s.split('::')[2].downcase.intern
|
cat = group.to_s.split('::')[2].downcase.intern
|
||||||
group = group.mn.ucc.intern
|
group = group.mn.ucc.intern
|
||||||
|
@ -96,31 +95,25 @@ module Treat::Entities::Entity::Delegatable
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"No configuration file loaded for language #{language}."
|
"No configuration file loaded for language #{language}."
|
||||||
end
|
end
|
||||||
|
|
||||||
workers = lang.workers
|
workers = lang.workers
|
||||||
|
|
||||||
if !workers.respond_to?(cat) ||
|
if !workers.respond_to?(cat) ||
|
||||||
!workers[cat].respond_to?(group)
|
!workers[cat].respond_to?(group)
|
||||||
workers = Treat.languages.agnostic.workers
|
workers = Treat.languages.agnostic.workers
|
||||||
end
|
end
|
||||||
|
|
||||||
if !workers.respond_to?(cat) ||
|
if !workers.respond_to?(cat) ||
|
||||||
!workers[cat].respond_to?(group)
|
!workers[cat].respond_to?(group)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"No #{group} is/are available for the " +
|
"No #{group} is/are available for the " +
|
||||||
"#{language.to_s.capitalize} language."
|
"#{language.to_s.capitalize} language."
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
workers[cat][group].first
|
workers[cat][group].first
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return an error message and suggest possible typos.
|
# Return an error message and suggest possible typos.
|
||||||
def worker_not_found(klass, group)
|
def worker_not_found(worker, group)
|
||||||
"Algorithm '#{klass.mn.ucc}' couldn't be "+
|
"Worker with name '#{worker}' couldn't be "+
|
||||||
"found in group #{group}." + Treat::Helpers::Help.
|
"found in group #{group}." + Treat::Helpers::Help.
|
||||||
did_you_mean?(group.list.map { |c| c.ucc }, klass.ucc)
|
did_you_mean?(group.list.map { |c| c.ucc }, worker)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -105,18 +105,6 @@ module Treat::Entities::Entity::Iterable
|
||||||
end
|
end
|
||||||
i
|
i
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return the first element in the array, warning if not
|
|
||||||
# the only one in the array. Used for magic methods: e.g.,
|
|
||||||
# the magic method "word" if called on a sentence with many
|
|
||||||
# words, Treat will return the first word, but warn the user.
|
|
||||||
def first_but_warn(array, type)
|
|
||||||
if array.size > 1
|
|
||||||
warn "Warning: requested one #{type}, but" +
|
|
||||||
" there are many #{type}s in this entity."
|
|
||||||
end
|
|
||||||
array[0]
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
|
@ -78,5 +78,16 @@ module Treat::Entities::Entity::Magical
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Return the first element in the array, warning if not
|
||||||
|
# the only one in the array. Used for magic methods: e.g.,
|
||||||
|
# the magic method "word" if called on a sentence with many
|
||||||
|
# words, Treat will return the first word, but warn the user.
|
||||||
|
def first_but_warn(array, type)
|
||||||
|
if array.size > 1
|
||||||
|
warn "Warning: requested one #{type}, but" +
|
||||||
|
" there are many #{type}s in this entity."
|
||||||
|
end
|
||||||
|
array[0]
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -6,6 +6,12 @@ module Treat::Entities::Entity::Stringable
|
||||||
# Returns the entity's true string value.
|
# Returns the entity's true string value.
|
||||||
def to_string; @value.dup; end
|
def to_string; @value.dup; end
|
||||||
|
|
||||||
|
# Returns an array of the childrens' string
|
||||||
|
# values, found by calling #to_s on them.
|
||||||
|
def to_a; @children.map { |c| c.to_s }; end
|
||||||
|
|
||||||
|
alias :to_ary :to_a
|
||||||
|
|
||||||
# Returns the entity's string value by
|
# Returns the entity's string value by
|
||||||
# imploding the value of all terminal
|
# imploding the value of all terminal
|
||||||
# entities in the subtree of that entity.
|
# entities in the subtree of that entity.
|
||||||
|
@ -52,16 +58,14 @@ module Treat::Entities::Entity::Stringable
|
||||||
end
|
end
|
||||||
|
|
||||||
# Helper method to implode the string value of the subtree.
|
# Helper method to implode the string value of the subtree.
|
||||||
def implode
|
def implode(value = "")
|
||||||
|
|
||||||
return @value.dup if !has_children?
|
return @value.dup if !has_children?
|
||||||
|
|
||||||
value = ''
|
|
||||||
|
|
||||||
each do |child|
|
each do |child|
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Section)
|
if child.is_a?(Treat::Entities::Section)
|
||||||
value += "\n\n"
|
value << "\n\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
||||||
|
@ -69,14 +73,14 @@ module Treat::Entities::Entity::Stringable
|
||||||
child.is_a?(Treat::Entities::Enclitic)
|
child.is_a?(Treat::Entities::Enclitic)
|
||||||
value.strip!
|
value.strip!
|
||||||
end
|
end
|
||||||
value += child.to_s + ' '
|
value << child.to_s + ' '
|
||||||
else
|
else
|
||||||
value += child.implode
|
child.implode(value)
|
||||||
end
|
end
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Title) ||
|
if child.is_a?(Treat::Entities::Title) ||
|
||||||
child.is_a?(Treat::Entities::Paragraph)
|
child.is_a?(Treat::Entities::Paragraph)
|
||||||
value += "\n\n"
|
value << "\n\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,18 +1,29 @@
|
||||||
|
# Helper methods to manipulate hashes.
|
||||||
class Treat::Helpers::Hash
|
class Treat::Helpers::Hash
|
||||||
|
|
||||||
# Allow getting the caller method in any context.
|
# Mixin to allow conversion of hashes to
|
||||||
Hash.class_eval do
|
# nested structs with the keys as attributes.
|
||||||
|
module ToStruct
|
||||||
# Converts a hash to nested structs.
|
# Converts a hash to nested structs.
|
||||||
def self.hash_to_struct(hash)
|
def to_struct
|
||||||
return hash if hash.keys.
|
hash = self
|
||||||
select { |k| !k.is_a?(Symbol) }.size > 0
|
symbols = hash.keys.select { |k|
|
||||||
struct = Struct.new(*hash.keys).new(*hash.values)
|
!k.is_a?(Symbol) }.size
|
||||||
|
return hash if symbols > 0
|
||||||
|
klass = Struct.new(*hash.keys)
|
||||||
|
struct = klass.new(*hash.values)
|
||||||
hash.each do |key, value|
|
hash.each do |key, value|
|
||||||
if value.is_a?(Hash)
|
if value.is_a?(Hash)
|
||||||
struct[key] = self.hash_to_struct(value)
|
v = value.to_struct
|
||||||
|
struct[key] = v
|
||||||
end
|
end
|
||||||
end; return struct
|
end; return struct
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Include the mixins on the core Hash class.
|
||||||
|
Hash.class_eval do
|
||||||
|
include Treat::Helpers::Hash::ToStruct
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -32,30 +32,4 @@ class Treat::Helpers::Help
|
||||||
msg
|
msg
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return the levensthein distance between
|
|
||||||
# two strings taking into account the costs
|
|
||||||
# of insertion, deletion, and substitution.
|
|
||||||
# Used by did_you_mean? to detect typos.
|
|
||||||
def self.levenshtein(first, other, ins=1, del=1, sub=1)
|
|
||||||
return nil if first.nil? || other.nil?
|
|
||||||
dm = []
|
|
||||||
dm[0] = (0..first.length).collect { |i| i * ins}
|
|
||||||
fill = [0] * (first.length - 1).abs
|
|
||||||
for i in 1..other.length
|
|
||||||
dm[i] = [i * del, fill.flatten]
|
|
||||||
end
|
|
||||||
for i in 1..other.length
|
|
||||||
for j in 1..first.length
|
|
||||||
dm[i][j] = [
|
|
||||||
dm[i-1][j-1] +
|
|
||||||
(first[i-1] ==
|
|
||||||
other[i-1] ? 0 : sub),
|
|
||||||
dm[i][j-1] + ins,
|
|
||||||
dm[i-1][j] + del
|
|
||||||
].min
|
|
||||||
end
|
|
||||||
end
|
|
||||||
dm[other.length][first.length]
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,46 +4,40 @@ class Treat::Helpers::Object
|
||||||
# Allow introspection onto what method called
|
# Allow introspection onto what method called
|
||||||
# another one at runtime (useful for debugging).
|
# another one at runtime (useful for debugging).
|
||||||
module CallerMethod
|
module CallerMethod
|
||||||
|
# Pattern to match method from trace.
|
||||||
|
CMPattern = /^(.+?):(\d+)(?::in `(.*)')?/
|
||||||
# Return the name of the method that
|
# Return the name of the method that
|
||||||
# called the method that calls this method.
|
# called the method that calls this method.
|
||||||
def caller_method(n = 3)
|
def caller_method(n = 3)
|
||||||
at = caller(n).first
|
at = caller(n).first
|
||||||
/^(.+?):(\d+)(?::in `(.*)')?/ =~ at
|
CMPattern =~ at
|
||||||
Regexp.last_match[3].
|
Regexp.last_match[3].
|
||||||
gsub('block in ', '').intern
|
gsub('block in ', '').intern
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Retrieve the last name of a class/module
|
# Retrieve the last name of a class/module
|
||||||
# (i.e. the part after the last "::").
|
# (i.e. the part after the last "::").
|
||||||
module ModuleName
|
module ModuleName
|
||||||
|
|
||||||
def module_name; self.to_s.split('::')[-1]; end
|
def module_name; self.to_s.split('::')[-1]; end
|
||||||
alias :mn :module_name
|
alias :mn :module_name
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
module Verbosity
|
module Verbosity
|
||||||
# Runs a block of code without warnings.
|
# Runs a block of code without warnings.
|
||||||
def silence_warnings(&block)
|
def silence_warnings(&block)
|
||||||
warn_level = $VERBOSE
|
warn_level = $VERBOSE; $VERBOSE = nil
|
||||||
$VERBOSE = nil
|
result = block.call; $VERBOSE = warn_level
|
||||||
result = block.call
|
|
||||||
$VERBOSE = warn_level
|
|
||||||
result
|
result
|
||||||
end
|
end
|
||||||
|
|
||||||
# Runs a block of code while blocking stdout.
|
# Runs a block of code while blocking stdout.
|
||||||
def silence_stdout(log = '/dev/null')
|
def silence_stdout(log = '/dev/null')
|
||||||
unless Treat.core.verbosity.silence
|
unless Treat.core.verbosity.silence
|
||||||
yield; return
|
yield; return
|
||||||
end
|
end
|
||||||
old = $stdout.dup
|
file, old, ret = File.new(log, 'w'),
|
||||||
$stdout.reopen(File.new(log, 'w'))
|
$stdout.dup, nil; $stdout.reopen(file)
|
||||||
yield
|
ret = yield; $stdout = old; return ret
|
||||||
$stdout = old
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -54,7 +54,7 @@ class Treat::Helpers::String
|
||||||
if @@cc_cache[o_phrase]
|
if @@cc_cache[o_phrase]
|
||||||
return @@cc_cache[o_phrase]
|
return @@cc_cache[o_phrase]
|
||||||
end
|
end
|
||||||
if Treat.core.acronyms.include?(phrase)
|
if Treat.core.acronyms.include?(phrase.downcase)
|
||||||
phrase = phrase.upcase
|
phrase = phrase.upcase
|
||||||
else
|
else
|
||||||
phrase.gsub!(Regex) { |a| a.upcase }
|
phrase.gsub!(Regex) { |a| a.upcase }
|
||||||
|
@ -99,12 +99,19 @@ class Treat::Helpers::String
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Determines whether module is
|
||||||
|
# an "-able" mixin kind of thing.
|
||||||
|
module IsMixin
|
||||||
|
def is_mixin?; to_s[-4..-1] == 'able'; end
|
||||||
|
end
|
||||||
|
|
||||||
# Graft the helpers onto the string module.
|
# Graft the helpers onto the string module.
|
||||||
String.class_eval do
|
String.class_eval do
|
||||||
include Treat::Helpers::String::CamelCaseable
|
include Treat::Helpers::String::CamelCaseable
|
||||||
include Treat::Helpers::String::UnCamelCaseable
|
include Treat::Helpers::String::UnCamelCaseable
|
||||||
include Treat::Helpers::String::Escapable
|
include Treat::Helpers::String::Escapable
|
||||||
include Treat::Helpers::String::Unescapable
|
include Treat::Helpers::String::Unescapable
|
||||||
|
include Treat::Helpers::String::IsMixin
|
||||||
end
|
end
|
||||||
|
|
||||||
# Graft camel casing onto symbols.
|
# Graft camel casing onto symbols.
|
||||||
|
|
|
@ -1,7 +0,0 @@
|
||||||
# Handles the verbosity for external
|
|
||||||
# programs (gems, binaries, etc.)
|
|
||||||
module Treat::Helpers::Verbosity
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
end
|
|
|
@ -63,7 +63,7 @@ class Treat::Learning::Problem
|
||||||
# all of the features.
|
# all of the features.
|
||||||
def export_features(e, include_answer = true)
|
def export_features(e, include_answer = true)
|
||||||
features = export(e, @features)
|
features = export(e, @features)
|
||||||
return features unless include_answer
|
return features if !include_answer
|
||||||
features << (e.has?(@question.name) ?
|
features << (e.has?(@question.name) ?
|
||||||
e.get(@question.name) : @question.default)
|
e.get(@question.name) : @question.default)
|
||||||
features
|
features
|
||||||
|
@ -80,9 +80,11 @@ class Treat::Learning::Problem
|
||||||
|
|
||||||
def export(entity, exports)
|
def export(entity, exports)
|
||||||
unless @question.target == entity.type
|
unless @question.target == entity.type
|
||||||
|
targ, type = @question.target, entity.type
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"This classification problem targets #{@question.target}s, " +
|
"This classification problem targets " +
|
||||||
"but a(n) #{entity.type} was passed to export instead."
|
"#{targ}s, but a(n) #{type} " +
|
||||||
|
"was passed to export instead."
|
||||||
end
|
end
|
||||||
ret = []
|
ret = []
|
||||||
exports.each do |export|
|
exports.each do |export|
|
||||||
|
@ -116,9 +118,8 @@ class Treat::Learning::Problem
|
||||||
question = Treat::Learning::Question.new(
|
question = Treat::Learning::Question.new(
|
||||||
hash['question']['name'],
|
hash['question']['name'],
|
||||||
hash['question']['target'],
|
hash['question']['target'],
|
||||||
hash['question']['type'],
|
|
||||||
hash['question']['default'],
|
hash['question']['default'],
|
||||||
hash['question']['labels']
|
hash['question']['type']
|
||||||
)
|
)
|
||||||
features = []
|
features = []
|
||||||
hash['features'].each do |feature|
|
hash['features'].each do |feature|
|
||||||
|
|
|
@ -16,12 +16,9 @@ class Treat::Learning::Question
|
||||||
attr_reader :type
|
attr_reader :type
|
||||||
# Default for the answer to the question.
|
# Default for the answer to the question.
|
||||||
attr_reader :default
|
attr_reader :default
|
||||||
# A list of possible answers to the question.
|
|
||||||
attr_reader :labels
|
|
||||||
|
|
||||||
# Initialize the question.
|
# Initialize the question.
|
||||||
def initialize(name, target,
|
def initialize(name, target, default = nil, type = :continuous)
|
||||||
type = :continuous, default = nil, labels = [])
|
|
||||||
unless name.is_a?(Symbol)
|
unless name.is_a?(Symbol)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Question name should be a symbol."
|
"Question name should be a symbol."
|
||||||
|
@ -35,8 +32,8 @@ class Treat::Learning::Question
|
||||||
raise Treat::Exception, "Type should be " +
|
raise Treat::Exception, "Type should be " +
|
||||||
"continuous or discrete."
|
"continuous or discrete."
|
||||||
end
|
end
|
||||||
@name, @target, @type, @default, @labels =
|
@name, @target, @type, @default =
|
||||||
name, target, type, default, labels
|
name, target, type, default
|
||||||
end
|
end
|
||||||
|
|
||||||
# Custom comparison operator for questions.
|
# Custom comparison operator for questions.
|
||||||
|
@ -44,8 +41,7 @@ class Treat::Learning::Question
|
||||||
@name == question.name &&
|
@name == question.name &&
|
||||||
@type == question.type &&
|
@type == question.type &&
|
||||||
@target == question.target &&
|
@target == question.target &&
|
||||||
@default == question.default &&
|
@default == question.default
|
||||||
@labels = question.labels
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -0,0 +1,52 @@
|
||||||
|
class Treat::Loaders::BindIt
|
||||||
|
|
||||||
|
# Keep track of whether its loaded or not.
|
||||||
|
@@loaded = {}
|
||||||
|
|
||||||
|
# Load CoreNLP package for a given language.
|
||||||
|
def self.load(klass, name, language = nil)
|
||||||
|
|
||||||
|
return if @@loaded[klass]
|
||||||
|
|
||||||
|
language ||= Treat.core.language.default
|
||||||
|
|
||||||
|
jar_path = Treat.libraries[name].jar_path ||
|
||||||
|
Treat.paths.bin + "#{name}/"
|
||||||
|
model_path = Treat.libraries[name].model_path ||
|
||||||
|
Treat.paths.models + "#{name}/"
|
||||||
|
|
||||||
|
if !File.directory?(jar_path)
|
||||||
|
raise Treat::Exception, "Looking for #{klass} " +
|
||||||
|
"library JAR files in #{jar_path}, but it is " +
|
||||||
|
"not a directory. Please set the config option " +
|
||||||
|
"Treat.libraries.#{name}.jar_path to a folder " +
|
||||||
|
"containing the appropriate JAR files."
|
||||||
|
end
|
||||||
|
|
||||||
|
if !File.directory?(model_path)
|
||||||
|
raise Treat::Exception, "Looking for #{klass} " +
|
||||||
|
"library model files in #{model_path}, but it " +
|
||||||
|
"is not a directory. Please set the config option " +
|
||||||
|
"Treat.libraries.#{name}.model_path to a folder " +
|
||||||
|
"containing the appropriate JAR files."
|
||||||
|
end
|
||||||
|
|
||||||
|
klass.jar_path = jar_path
|
||||||
|
klass.model_path = model_path
|
||||||
|
klass.use language
|
||||||
|
|
||||||
|
if Treat.core.verbosity.silence
|
||||||
|
if Gem.win_platform?
|
||||||
|
klass.log_file = 'NUL'
|
||||||
|
else
|
||||||
|
klass.log_file = '/dev/null'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
klass.bind
|
||||||
|
|
||||||
|
@@loaded[klass] = true
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -10,14 +10,13 @@ class Treat::Loaders::Linguistics
|
||||||
# to the supplied language; raises an exception
|
# to the supplied language; raises an exception
|
||||||
# if there is no such language class registered.
|
# if there is no such language class registered.
|
||||||
def self.load(language)
|
def self.load(language)
|
||||||
silence_warnings do
|
code = language.to_s[0..1].intern # FIX
|
||||||
# Linguistics throws warnings; silence them.
|
unless @@languages[language]
|
||||||
silence_warnings { require 'linguistics' }
|
require 'linguistics'
|
||||||
code = language.to_s[0..1].upcase
|
Linguistics.use(code)
|
||||||
@@languages[language] ||=
|
@@languages[language] = true
|
||||||
::Linguistics.const_get(code)
|
|
||||||
end
|
end
|
||||||
return @@languages[language]
|
code
|
||||||
rescue RuntimeError
|
rescue RuntimeError
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Ruby Linguistics does not have a module " +
|
"Ruby Linguistics does not have a module " +
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
require_relative 'bind_it'
|
||||||
|
|
||||||
|
# A helper class to load the OpenNLP package.
|
||||||
|
class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt
|
||||||
|
|
||||||
|
def self.load(language = nil)
|
||||||
|
require 'open-nlp'
|
||||||
|
super(OpenNLP, :open_nlp, language)
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -1,24 +1,20 @@
|
||||||
# A helper class to load the CoreNLP package.
|
require_relative 'bind_it'
|
||||||
class Treat::Loaders::Stanford
|
|
||||||
|
|
||||||
# Keep track of whether its loaded or not.
|
|
||||||
@@loaded = false
|
|
||||||
|
|
||||||
# Load CoreNLP package for a given language.
|
# A helper class to load the CoreNLP package.
|
||||||
def self.load(language = nil)
|
class Treat::Loaders::Stanford < Treat::Loaders::BindIt
|
||||||
return if @@loaded
|
|
||||||
require 'stanford-core-nlp'
|
|
||||||
language ||= Treat.core.language.default
|
|
||||||
StanfordCoreNLP.jar_path =
|
|
||||||
Treat.libraries.stanford.jar_path ||
|
|
||||||
Treat.paths.bin + 'stanford/'
|
|
||||||
StanfordCoreNLP.model_path =
|
|
||||||
Treat.libraries.stanford.model_path ||
|
|
||||||
Treat.paths.models + 'stanford/'
|
|
||||||
StanfordCoreNLP.use(language)
|
|
||||||
StanfordCoreNLP.log_file = '/dev/null' if
|
|
||||||
Treat.core.verbosity.silence
|
|
||||||
StanfordCoreNLP.bind; @@loaded = true
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
def self.load(language = nil)
|
||||||
|
require 'stanford-core-nlp'
|
||||||
|
super(StanfordCoreNLP, :stanford, language)
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.find_model(name, language)
|
||||||
|
language = language.intern
|
||||||
|
model_file = StanfordCoreNLP::Config::Models[name][language]
|
||||||
|
model_dir = StanfordCoreNLP::Config::ModelFolders[name]
|
||||||
|
model_path = Treat.libraries.stanford.model_path ||
|
||||||
|
File.join(Treat.paths.models, 'stanford')
|
||||||
|
File.join(model_path, model_dir, model_file)
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -1,13 +1,13 @@
|
||||||
module Treat
|
module Treat
|
||||||
|
|
||||||
|
# Contains common utility/helper functions.
|
||||||
|
module Helpers; include Autoload; end
|
||||||
|
|
||||||
# Contains all the configuration options.
|
# Contains all the configuration options.
|
||||||
module Config; include Autoload; end
|
module Config; include Autoload; end
|
||||||
|
|
||||||
# Load all the configuration options.
|
# Import all the configuration options.
|
||||||
Treat::Config.configure!
|
Treat::Config.import!
|
||||||
|
|
||||||
# Contains common utility/helper functions.
|
|
||||||
module Helpers; include Autoload; end
|
|
||||||
|
|
||||||
# Contains classes to load external libraries.
|
# Contains classes to load external libraries.
|
||||||
module Loaders; include Autoload; end
|
module Loaders; include Autoload; end
|
||||||
|
@ -20,7 +20,10 @@ module Treat
|
||||||
|
|
||||||
# Contains all the worker categories.
|
# Contains all the worker categories.
|
||||||
module Workers; include Autoload; end
|
module Workers; include Autoload; end
|
||||||
|
|
||||||
|
# Make all the worker categories.
|
||||||
|
Treat::Workers.categorize!
|
||||||
|
|
||||||
# Installs builders on core Ruby objects.
|
# Installs builders on core Ruby objects.
|
||||||
module Proxies; include Autoload; end
|
module Proxies; include Autoload; end
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
module Treat::Proxies
|
||||||
|
|
||||||
|
module Array
|
||||||
|
# Include base proxy functionality.
|
||||||
|
include Treat::Proxies::Proxy
|
||||||
|
def method_missing(sym, *args, &block)
|
||||||
|
if [:do, :apply].include?(sym) ||
|
||||||
|
Treat::Workers.lookup(sym)
|
||||||
|
map do |el|
|
||||||
|
if el.is_a?(Treat::Entities::Entity)
|
||||||
|
el.send(sym, *args)
|
||||||
|
else
|
||||||
|
el.to_entity.send(sym, *args)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
else
|
||||||
|
super(sym, *args, &block)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Include Treat methods on numerics.
|
||||||
|
::Array.class_eval do
|
||||||
|
include Treat::Proxies::Array
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -21,17 +21,18 @@ module Treat::Proxies
|
||||||
!Treat.core.language.detect
|
!Treat.core.language.detect
|
||||||
|
|
||||||
if is_a?(Treat::Entities::Symbol) ||
|
if is_a?(Treat::Entities::Symbol) ||
|
||||||
is_a?(Treat::Entities::Number)
|
is_a?(Treat::Entities::Number) ||
|
||||||
|
is_a?(Treat::Entities::Punctuation)
|
||||||
return Treat.core.language.default
|
return Treat.core.language.default
|
||||||
end
|
end
|
||||||
|
|
||||||
dlvl = Treat.core.language.detect_at
|
dlvl = Treat.core.language.detect_at
|
||||||
dklass = Treat::Entities.const_get(dlvl.cc)
|
dklass = Treat::Entities.const_get(dlvl.cc)
|
||||||
|
|
||||||
if self.class.compare_with(
|
if self.class.compare_with(dklass) < 1
|
||||||
dklass) < 1 && has_parent?
|
|
||||||
anc = ancestor_with_type(dlvl)
|
anc = ancestor_with_type(dlvl)
|
||||||
return anc.language if anc
|
return anc.language if anc
|
||||||
|
return self.parent.language if has_parent?
|
||||||
end
|
end
|
||||||
|
|
||||||
extractor ||= Treat.workers.
|
extractor ||= Treat.workers.
|
||||||
|
|
|
@ -10,15 +10,16 @@ module Treat::Proxies
|
||||||
# object and send the method call to the entity.
|
# object and send the method call to the entity.
|
||||||
def method_missing(sym, *args, &block)
|
def method_missing(sym, *args, &block)
|
||||||
if [:do, :apply].include?(sym) ||
|
if [:do, :apply].include?(sym) ||
|
||||||
Treat::Workers::Category.lookup(sym)
|
Treat::Workers.lookup(sym)
|
||||||
to_entity.send(sym, *args)
|
to_entity.send(sym, *args)
|
||||||
else
|
else
|
||||||
super(sym, *args, &block)
|
super(sym, *args, &block)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Create an unknown type of entity by default.
|
# Create an unknown type of entity by default.
|
||||||
def to_entity(builder = nil)
|
def to_entity(builder = nil)
|
||||||
Treat::Entities::Unknown(self.to_s)
|
Treat::Entities::Unknown.new(self.to_s)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
module Treat
|
module Treat
|
||||||
|
|
||||||
# The current version of Treat.
|
# The current version of Treat.
|
||||||
VERSION = "1.2.0"
|
VERSION = '2.1.0'
|
||||||
|
|
||||||
# Treat requires Ruby >= 1.9.2
|
# Treat requires Ruby >= 1.9.2
|
||||||
if RUBY_VERSION < '1.9.2'
|
if RUBY_VERSION < '1.9.2'
|
||||||
raise "Treat requires Ruby version 1.9.2 " +
|
raise "Treat requires Ruby version 1.9.2 " +
|
||||||
"or higher, but current is #{RUBY_VERSION}."
|
"or higher, but current is #{RUBY_VERSION}."
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,51 +1,49 @@
|
||||||
# This module creates all the worker categories
|
# This module creates all the worker categories
|
||||||
# and the groups within these categories and adds
|
# and the groups within these categories and adds
|
||||||
# the relevant hooks on the appropriate entities.
|
# the relevant hooks on the appropriate entities.
|
||||||
module Treat::Workers::Category
|
module Treat::Workers::Categorizable
|
||||||
|
|
||||||
require_relative 'group'
|
require_relative 'groupable'
|
||||||
|
|
||||||
# A lookup table for entity types.
|
# A lookup table for entity types.
|
||||||
@@lookup = {}
|
@@lookup = {}
|
||||||
|
|
||||||
# Find a worker group based on method.
|
# Find a worker group based on method.
|
||||||
def self.lookup(method)
|
def lookup(method); @@lookup[method]; end
|
||||||
@@lookup[method]
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.create_categories
|
def categorize!
|
||||||
Treat.workers.members.each do |cat|
|
Treat.workers.members.each do |cat|
|
||||||
create_category(cat.
|
name = cat.capitalize.intern
|
||||||
capitalize.intern,
|
conf = load_category_conf(cat)
|
||||||
load_category_conf(cat))
|
create_category(name, conf)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.load_category_conf(name)
|
def load_category_conf(name)
|
||||||
config = Treat.workers[name]
|
if !Treat.workers.respond_to?(name)
|
||||||
if config.nil?
|
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"The configuration file " +
|
"The configuration file " +
|
||||||
"for #{cat_sym} is missing."
|
"for #{cat_sym} is missing."
|
||||||
|
else
|
||||||
|
Treat.workers[name]
|
||||||
end
|
end
|
||||||
config
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.create_category(name, conf)
|
def create_category(name, conf)
|
||||||
category = Treat::Workers.
|
category = Treat::Workers.
|
||||||
const_set(name, Module.new)
|
const_set(name, Module.new)
|
||||||
conf.each_pair do |group, worker|
|
conf.each_pair do |group, worker|
|
||||||
name = group.to_s.cc.intern
|
name = group.to_s.cc.intern
|
||||||
category.module_eval do
|
category.module_eval do
|
||||||
@@methods = []; def methods;
|
@@methods = []
|
||||||
@@methods; end; def groups;
|
def methods; @@methods; end
|
||||||
self.constants; end
|
def groups; self.constants; end
|
||||||
end
|
end
|
||||||
self.create_group(name, worker, category)
|
create_group(name, worker, category)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.create_group(name, conf, category)
|
def create_group(name, conf, category)
|
||||||
group = category.const_set(name, Module.new)
|
group = category.const_set(name, Module.new)
|
||||||
self.set_group_options(group, conf)
|
self.set_group_options(group, conf)
|
||||||
self.bind_group_targets(group)
|
self.bind_group_targets(group)
|
||||||
|
@ -54,27 +52,9 @@ module Treat::Workers::Category
|
||||||
@@lookup[group.method] = group
|
@@lookup[group.method] = group
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.bind_group_targets(group)
|
def set_group_options(group, conf)
|
||||||
group.targets.each do |entity_type|
|
|
||||||
entity = Treat::Entities.
|
|
||||||
const_get(entity_type.cc)
|
|
||||||
entity.class_eval do
|
|
||||||
add_workers group
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.register_group_presets(group, conf)
|
|
||||||
return unless conf.respond_to? :presets
|
|
||||||
conf.presets.each do |m|
|
|
||||||
@@methods << m
|
|
||||||
@@lookup[m] = group
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.set_group_options(group, conf)
|
|
||||||
group.module_eval do
|
group.module_eval do
|
||||||
extend Treat::Workers::Group
|
extend Treat::Workers::Groupable
|
||||||
self.type = conf.type
|
self.type = conf.type
|
||||||
self.targets = conf.targets
|
self.targets = conf.targets
|
||||||
if conf.respond_to?(:default)
|
if conf.respond_to?(:default)
|
||||||
|
@ -92,6 +72,22 @@ module Treat::Workers::Category
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
self.create_categories
|
def bind_group_targets(group)
|
||||||
|
group.targets.each do |entity_type|
|
||||||
|
entity = Treat::Entities.
|
||||||
|
const_get(entity_type.cc)
|
||||||
|
entity.class_eval do
|
||||||
|
add_workers group
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def register_group_presets(group, conf)
|
||||||
|
return unless conf.respond_to?(:presets)
|
||||||
|
conf.presets.each do |method|
|
||||||
|
@@methods << method
|
||||||
|
@@lookup[method] = group
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -0,0 +1,35 @@
|
||||||
|
# The C extension uses char* strings, and so Unicode strings
|
||||||
|
# will give incorrect distances. Need to provide a pure
|
||||||
|
# implementation if that's the case (FIX).
|
||||||
|
class Treat::Workers::Extractors::Distance::Levenshtein
|
||||||
|
|
||||||
|
require 'levenshtein'
|
||||||
|
|
||||||
|
DefaultOptions = {
|
||||||
|
ins_cost: 1,
|
||||||
|
del_cost: 1,
|
||||||
|
sub_cost: 1
|
||||||
|
}
|
||||||
|
|
||||||
|
@@matcher = nil
|
||||||
|
|
||||||
|
# Return the levensthein distance between
|
||||||
|
# two strings taking into account the costs
|
||||||
|
# of insertion, deletion, and substitution.
|
||||||
|
def self.distance(entity, options)
|
||||||
|
|
||||||
|
options = DefaultOptions.merge(options)
|
||||||
|
|
||||||
|
unless options[:to]
|
||||||
|
raise Treat::Exception, "Must supply " +
|
||||||
|
"a string/entity to compare to using " +
|
||||||
|
"the option :to for this worker."
|
||||||
|
end
|
||||||
|
|
||||||
|
a, b = entity.to_s, options[:to].to_s
|
||||||
|
|
||||||
|
Levenshtein.distance(a, b)
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -23,19 +23,16 @@ class Treat::Workers::Extractors::Keywords::TfIdf
|
||||||
|
|
||||||
tf_idfs = tf_idfs.
|
tf_idfs = tf_idfs.
|
||||||
sort_by {|k,v| v}.reverse
|
sort_by {|k,v| v}.reverse
|
||||||
|
|
||||||
if tf_idfs.size <= options[:number]
|
|
||||||
return tf_idfs
|
|
||||||
end
|
|
||||||
|
|
||||||
keywords = []
|
keywords = []
|
||||||
i = 0
|
i = 0
|
||||||
|
max_count = tf_idfs.size < options[:number] ? tf_idfs.size : options[:number]
|
||||||
|
|
||||||
tf_idfs.each do |word|
|
tf_idfs.each do |word|
|
||||||
|
|
||||||
w = word[0].to_s
|
w = word[0].to_s
|
||||||
next if keywords.include?(w)
|
next if keywords.include?(w)
|
||||||
break if i > options[:number]
|
break if i > max_count
|
||||||
keywords << w
|
keywords << w
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
# Language detection using a probabilistic algorithm
|
# Language detection using a probabilistic algorithm
|
||||||
# that checks for the presence of words with Bloom
|
# that checks for the presence of words with Bloom
|
||||||
# filters built from dictionaries for each language.
|
# filters built from dictionaries for each language.
|
||||||
#
|
#
|
||||||
# Original paper: Grothoff. 2007. A Quick Introduction to
|
# Original paper: Grothoff. 2007. A Quick Introduction to
|
||||||
# Bloom Filters. Department of Computer Sciences, Purdue
|
# Bloom Filters. Department of Computer Sciences, Purdue
|
||||||
# University.
|
# University.
|
||||||
class Treat::Workers::Extractors::Language::WhatLanguage
|
class Treat::Workers::Extractors::Language::WhatLanguage
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ class Treat::Workers::Extractors::Language::WhatLanguage
|
||||||
|
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
|
|
||||||
@@detector ||= ::WhatLanguage.new(:possibilities)
|
@@detector ||= ::WhatLanguage.new(:all)
|
||||||
possibilities = @@detector.process_text(entity.to_s)
|
possibilities = @@detector.process_text(entity.to_s)
|
||||||
lang = {}
|
lang = {}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# Named entity tag extraction using the Stanford NLP
|
# Named entity tag extraction using the Stanford NLP
|
||||||
# Deterministic Coreference Resolver, which implements a
|
# Deterministic Coreference Resolver, which implements a
|
||||||
# multi-pass sieve coreference resolution (or anaphora
|
# multi-pass sieve coreference resolution (or anaphora
|
||||||
# resolution) system.
|
# resolution) system based on conditional random fields.
|
||||||
#
|
#
|
||||||
# Original paper: Heeyoung Lee, Yves Peirsman, Angel
|
# Original paper: Heeyoung Lee, Yves Peirsman, Angel
|
||||||
# Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
|
# Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
|
||||||
|
@ -16,32 +16,24 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
||||||
|
|
||||||
def self.name_tag(entity, options = {})
|
def self.name_tag(entity, options = {})
|
||||||
|
|
||||||
pp = nil
|
|
||||||
|
|
||||||
language = entity.language
|
language = entity.language
|
||||||
|
|
||||||
Treat::Loaders::Stanford.load(language)
|
Treat::Loaders::Stanford.load(language)
|
||||||
|
|
||||||
isolated_token = entity.is_a?(Treat::Entities::Token)
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
||||||
tokens = isolated_token ? [entity] : entity.tokens
|
tokens = isolated_token ? [entity] : entity.tokens
|
||||||
|
|
||||||
ms = StanfordCoreNLP::Config::Models[:ner][language]
|
unless classifier = @@classifiers[language]
|
||||||
model_path = Treat.libraries.stanford.model_path ||
|
model = Treat::Loaders::Stanford.find_model(:ner, language)
|
||||||
(Treat.paths.models + '/stanford/')
|
unless StanfordCoreNLP.const_defined?('CRFClassifier')
|
||||||
ms = model_path + '/' +
|
StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
|
||||||
StanfordCoreNLP::Config::ModelFolders[:ner] +
|
end
|
||||||
ms['3class']
|
classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
|
||||||
|
@@classifiers[language] = classifier
|
||||||
@@classifiers[language] ||=
|
end
|
||||||
StanfordCoreNLP::CRFClassifier.
|
|
||||||
getClassifier(ms)
|
|
||||||
|
|
||||||
token_list = StanfordCoreNLP.get_list(tokens)
|
token_list = StanfordCoreNLP.get_list(tokens)
|
||||||
sentence = @@classifiers[language].
|
sentence = classifier.classify_sentence(token_list)
|
||||||
classify_sentence(token_list)
|
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
n = 0
|
|
||||||
|
|
||||||
sentence.each do |s_token|
|
sentence.each do |s_token|
|
||||||
tag = s_token.get(:answer).to_s.downcase
|
tag = s_token.get(:answer).to_s.downcase
|
||||||
|
@ -49,14 +41,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
|
||||||
return tag if isolated_token
|
return tag if isolated_token
|
||||||
if tag
|
if tag
|
||||||
tokens[i].set :name_tag, tag
|
tokens[i].set :name_tag, tag
|
||||||
n += 1
|
|
||||||
end
|
end
|
||||||
i += 1
|
i += 1
|
||||||
end
|
end
|
||||||
|
|
||||||
entity.set :named_entity_count, n
|
|
||||||
|
|
||||||
nil
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
# Similarity measure for short strings such as person names.
|
||||||
|
# C extension won't work for Unicode strings; need to set
|
||||||
|
# extension to "pure" in that case (FIX).
|
||||||
|
class Treat::Workers::Extractors::Similarity::JaroWinkler
|
||||||
|
|
||||||
|
require 'fuzzystringmatch'
|
||||||
|
|
||||||
|
DefaultOptions = {
|
||||||
|
threshold: 0.7,
|
||||||
|
implementation: nil
|
||||||
|
}
|
||||||
|
|
||||||
|
@@matcher = nil
|
||||||
|
|
||||||
|
def self.similarity(entity, options={})
|
||||||
|
|
||||||
|
options = DefaultOptions.merge(options)
|
||||||
|
|
||||||
|
unless options[:to]
|
||||||
|
raise Treat::Exception, "Must supply " +
|
||||||
|
"a string/entity to compare to using " +
|
||||||
|
"the option :to for this worker."
|
||||||
|
end
|
||||||
|
|
||||||
|
unless @@matcher
|
||||||
|
impl = options[:implementation]
|
||||||
|
impl ||= defined?(JRUBY_VERSION) ? :pure : :native
|
||||||
|
klass = FuzzyStringMatch::JaroWinkler
|
||||||
|
@@matcher = klass.create(impl)
|
||||||
|
end
|
||||||
|
|
||||||
|
a, b = entity.to_s, options[:to].to_s
|
||||||
|
|
||||||
|
@@matcher.getDistance(a, b)
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -0,0 +1,43 @@
|
||||||
|
# Calculates the TF*IDF score of words.
|
||||||
|
class Treat::Workers::Extractors::Similarity::TfIdf
|
||||||
|
|
||||||
|
require 'tf-idf-similarity'
|
||||||
|
|
||||||
|
def self.similarity(entity, options={})
|
||||||
|
|
||||||
|
raise 'Not currently implemented.'
|
||||||
|
|
||||||
|
unless options[:to] &&
|
||||||
|
options[:to].type == :document
|
||||||
|
raise Treat::Exception, 'Must supply ' +
|
||||||
|
'a document to compare to using ' +
|
||||||
|
'the option :to for this worker.'
|
||||||
|
end
|
||||||
|
|
||||||
|
unless options[:to].parent_collection &&
|
||||||
|
entity.parent_collection
|
||||||
|
raise Treat::Exception, 'The TF*IDF ' +
|
||||||
|
'similarity algorithm can only be applied ' +
|
||||||
|
'to documents that are inside collections.'
|
||||||
|
end
|
||||||
|
|
||||||
|
coll = TfIdfSimilarity::Collection.new
|
||||||
|
|
||||||
|
entity.each_document do |doc|
|
||||||
|
tdoc = TfIdfSimilarity::Document.new(doc.to_s)
|
||||||
|
term_counts = Hash.new(0)
|
||||||
|
doc.each_word do |word|
|
||||||
|
val = word.value.downcase
|
||||||
|
term_counts[val] ||= 0.0
|
||||||
|
term_counts[val] += 1.0
|
||||||
|
end
|
||||||
|
size = term_counts.values.reduce(:+)
|
||||||
|
tdoc.instance_eval do
|
||||||
|
@term_counts, @size = term_counts, size
|
||||||
|
end
|
||||||
|
coll << tdoc
|
||||||
|
end
|
||||||
|
puts coll.similarity_matrix.inspect
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Time/date extraction using a simple rule-based library.
|
||||||
|
#
|
||||||
|
# Supported formats: Today, yesterday, tomorrow,
|
||||||
|
# last thursday, this thursday, 14 Sep, 14 June 2010.
|
||||||
|
# Any dates without a year are assumed to be in the past.
|
||||||
|
class Treat::Workers::Extractors::Time::Kronic
|
||||||
|
|
||||||
|
require 'kronic'
|
||||||
|
require 'date'
|
||||||
|
|
||||||
|
# Return the date information contained within
|
||||||
|
# the entity by parsing it with the 'kronic' gem.
|
||||||
|
#
|
||||||
|
# Options: none.
|
||||||
|
def self.time(entity, options = {})
|
||||||
|
time = Kronic.parse(entity.to_s)
|
||||||
|
time.is_a?(DateTime) ? time : nil
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -53,9 +53,9 @@ class Treat::Workers::Extractors::TopicWords::LDA
|
||||||
# Run the EM algorithm using random
|
# Run the EM algorithm using random
|
||||||
# starting points
|
# starting points
|
||||||
|
|
||||||
silence_stdout do
|
Treat.core.verbosity.silence ?
|
||||||
lda.em('random')
|
silence_stdout { lda.em('random') } :
|
||||||
end
|
lda.em('random')
|
||||||
|
|
||||||
# Load the vocabulary.
|
# Load the vocabulary.
|
||||||
if options[:vocabulary]
|
if options[:vocabulary]
|
||||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
|
||||||
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
|
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
|
||||||
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
||||||
DefaultOptions = {
|
DefaultOptions = {
|
||||||
:default_to => 'txt'
|
:default_to => 'document'
|
||||||
}
|
}
|
||||||
|
|
||||||
# Choose a reader to use.
|
# Choose a reader to use.
|
||||||
|
@ -12,7 +12,9 @@ class Treat::Workers::Formatters::Readers::Autoselect
|
||||||
# - (Symbol) :default_to => format to default to.
|
# - (Symbol) :default_to => format to default to.
|
||||||
def self.read(document, options = {})
|
def self.read(document, options = {})
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
document.read(detect_format(document.file, options[:default_to]))
|
fmt = detect_format(document.file, options[:default_to])
|
||||||
|
Treat::Workers::Formatters::Readers.
|
||||||
|
const_get(fmt.cc).read(document,options)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.detect_format(filename, default_to = nil)
|
def self.detect_format(filename, default_to = nil)
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
require 'yomu'
|
||||||
|
|
||||||
|
# This class is a wrapper for Yomu.
|
||||||
|
# Yomu is a library for extracting text and metadata from files and documents
|
||||||
|
# using the Apache Tika content analysis toolkit.
|
||||||
|
class Treat::Workers::Formatters::Readers::Document
|
||||||
|
# Extract the readable text from any document.
|
||||||
|
#
|
||||||
|
# Options: none.
|
||||||
|
def self.read(document, options = {})
|
||||||
|
yomu = Yomu.new(document.file)
|
||||||
|
|
||||||
|
document.value = yomu.text
|
||||||
|
document.set :format, yomu.mimetype.extensions.first
|
||||||
|
document
|
||||||
|
end
|
||||||
|
end
|
|
@ -11,7 +11,8 @@ class Treat::Workers::Formatters::Readers::HTML
|
||||||
# By default, don't backup the original HTML
|
# By default, don't backup the original HTML
|
||||||
DefaultOptions = {
|
DefaultOptions = {
|
||||||
:keep_html => false,
|
:keep_html => false,
|
||||||
:tags => %w[p div h1 h2 h3 ul ol dl dt li]
|
:tags => %w[p div h1 h2 h3 ul ol dl dt li img],
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Read the HTML document and strip it of its markup.
|
# Read the HTML document and strip it of its markup.
|
||||||
|
@ -46,6 +47,7 @@ class Treat::Workers::Formatters::Readers::HTML
|
||||||
d = Readability::Document.new(html, options)
|
d = Readability::Document.new(html, options)
|
||||||
document.value = "<h1>#{d.title}</h1>\n" + d.content
|
document.value = "<h1>#{d.title}</h1>\n" + d.content
|
||||||
document.set :format, 'html'
|
document.set :format, 'html'
|
||||||
|
document.set :images, d.images
|
||||||
end
|
end
|
||||||
|
|
||||||
document
|
document
|
||||||
|
|
|
@ -7,8 +7,8 @@
|
||||||
# statistical natural language modeling, and multi-
|
# statistical natural language modeling, and multi-
|
||||||
# lingual capabilities."
|
# lingual capabilities."
|
||||||
#
|
#
|
||||||
# Original paper: Google Ocropus Engine: Breuel,
|
# Original paper: Google Ocropus Engine: Breuel,
|
||||||
# Thomas M. The Ocropus Open Source OCR System.
|
# Thomas M. The Ocropus Open Source OCR System.
|
||||||
# DFKI and U. Kaiserslautern, Germany.
|
# DFKI and U. Kaiserslautern, Germany.
|
||||||
class Treat::Workers::Formatters::Readers::Image
|
class Treat::Workers::Formatters::Readers::Image
|
||||||
|
|
||||||
|
@ -18,29 +18,31 @@ class Treat::Workers::Formatters::Readers::Image
|
||||||
#
|
#
|
||||||
# - (Boolean) :silent => whether to silence Ocropus.
|
# - (Boolean) :silent => whether to silence Ocropus.
|
||||||
def self.read(document, options = {})
|
def self.read(document, options = {})
|
||||||
|
|
||||||
read = lambda do |doc|
|
read = lambda do |doc|
|
||||||
self.create_temp_dir do |tmp|
|
self.create_temp_dir do |tmp|
|
||||||
`ocropus book2pages #{tmp}/out #{doc.file}`
|
`ocropus-nlbin -o #{tmp}/out #{doc.file}`
|
||||||
`ocropus pages2lines #{tmp}/out`
|
`ocropus-gpageseg #{tmp}/out/????.bin.png --minscale 2`
|
||||||
`ocropus lines2fsts #{tmp}/out`
|
`ocropus-rpred #{tmp}/out/????/??????.bin.png`
|
||||||
`ocropus buildhtml #{tmp}/out > #{tmp}/output.html`
|
`ocropus-hocr #{tmp}/out/????.bin.png -o #{tmp}/book.html`
|
||||||
doc.set :file, "#{tmp}/output.html"
|
doc.set :file, "#{tmp}/book.html"
|
||||||
|
doc.set :format, :html
|
||||||
|
|
||||||
doc = doc.read(:html)
|
doc = doc.read(:html)
|
||||||
doc.set :file, f
|
|
||||||
doc.set :format, 'image'
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
options[:silent] ? silence_stdout {
|
Treat.core.verbosity.silence ? silence_stdout {
|
||||||
read.call(document) } : read.call(document)
|
read.call(document) } : read.call(document)
|
||||||
|
|
||||||
document
|
document
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Create a dire that gets deleted after execution of the block.
|
# Create a dir that gets deleted after execution of the block.
|
||||||
def self.create_temp_dir(&block)
|
def self.create_temp_dir(&block)
|
||||||
|
if not FileTest.directory?(Treat.paths.tmp)
|
||||||
|
FileUtils.mkdir(Treat.paths.tmp)
|
||||||
|
end
|
||||||
dname = Treat.paths.tmp +
|
dname = Treat.paths.tmp +
|
||||||
"#{Random.rand(10000000).to_s}"
|
"#{Random.rand(10000000).to_s}"
|
||||||
Dir.mkdir(dname)
|
Dir.mkdir(dname)
|
||||||
|
@ -48,5 +50,5 @@ class Treat::Workers::Formatters::Readers::Image
|
||||||
ensure
|
ensure
|
||||||
FileUtils.rm_rf(dname)
|
FileUtils.rm_rf(dname)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -32,6 +32,9 @@ class Treat::Workers::Formatters::Readers::PDF
|
||||||
# Create a temporary file which is deleted
|
# Create a temporary file which is deleted
|
||||||
# after execution of the block.
|
# after execution of the block.
|
||||||
def self.create_temp_file(ext, value = nil, &block)
|
def self.create_temp_file(ext, value = nil, &block)
|
||||||
|
if not FileTest.directory?(Treat.paths.tmp)
|
||||||
|
FileUtils.mkdir(Treat.paths.tmp)
|
||||||
|
end
|
||||||
fname = Treat.paths.tmp +
|
fname = Treat.paths.tmp +
|
||||||
"#{Random.rand(10000000).to_s}.#{ext}"
|
"#{Random.rand(10000000).to_s}.#{ext}"
|
||||||
File.open(fname, 'w') do |f|
|
File.open(fname, 'w') do |f|
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Treat::Workers::Formatters::Readers::XML
|
||||||
@@xml_reader ||= StanfordCoreNLP.load(
|
@@xml_reader ||= StanfordCoreNLP.load(
|
||||||
:tokenize, :ssplit, :cleanxml)
|
:tokenize, :ssplit, :cleanxml)
|
||||||
|
|
||||||
text = StanfordCoreNLP::Text.new(xml)
|
text = StanfordCoreNLP::Annotation.new(xml)
|
||||||
@@xml_reader.annotate(text)
|
@@xml_reader.annotate(text)
|
||||||
|
|
||||||
text.get(:sentences).each do |sentence|
|
text.get(:sentences).each do |sentence|
|
||||||
|
|
|
@ -9,18 +9,19 @@ class Treat::Workers::Formatters::Serializers::XML
|
||||||
# - (String) :file => a file to write to.
|
# - (String) :file => a file to write to.
|
||||||
def self.serialize(entity, options = {})
|
def self.serialize(entity, options = {})
|
||||||
options[:file] ||= (entity.id.to_s + '.xml')
|
options[:file] ||= (entity.id.to_s + '.xml')
|
||||||
if options[:indent].nil?
|
options[:indent] = 0
|
||||||
options = options.merge({:indent => 0})
|
enc = entity.to_s.encoding.to_s.downcase
|
||||||
end
|
string = "<?xml version=\"1.0\" " +
|
||||||
indent = options[:indent]
|
"encoding=\"#{enc}\" ?>\n<treat>\n"
|
||||||
if options[:indent] == 0
|
val = self.recurse(entity, options)
|
||||||
enc = entity.to_s.encoding.to_s.downcase
|
string += "#{val}\n</treat>"
|
||||||
string = "<?xml version=\"1.0\" " +
|
File.open(options[:file], 'w') do |f|
|
||||||
"encoding=\"#{enc}\" ?>\n<treat>\n"
|
f.write(string)
|
||||||
else
|
end; return options[:file]
|
||||||
string = ''
|
end
|
||||||
end
|
|
||||||
spaces = ''
|
def self.recurse(entity, options)
|
||||||
|
spaces, string = '', ''
|
||||||
options[:indent].times { spaces << ' ' }
|
options[:indent].times { spaces << ' ' }
|
||||||
attributes = " id='#{entity.id}'"
|
attributes = " id='#{entity.id}'"
|
||||||
if !entity.features.nil? && entity.features.size != 0
|
if !entity.features.nil? && entity.features.size != 0
|
||||||
|
@ -56,27 +57,16 @@ class Treat::Workers::Formatters::Serializers::XML
|
||||||
if entity.has_children?
|
if entity.has_children?
|
||||||
options[:indent] += 1
|
options[:indent] += 1
|
||||||
entity.children.each do |child|
|
entity.children.each do |child|
|
||||||
string =
|
string += self.recurse(child, options)
|
||||||
string +
|
|
||||||
serialize(child, options)
|
|
||||||
end
|
end
|
||||||
options[:indent] -= 1
|
options[:indent] -= 1
|
||||||
else
|
else
|
||||||
string = string + "#{escape(entity.value)}"
|
string += "#{escape(entity.value)}"
|
||||||
end
|
end
|
||||||
unless entity.is_a?(Treat::Entities::Token)
|
unless entity.is_a?(Treat::Entities::Token)
|
||||||
string += "#{spaces}"
|
string += "#{spaces}"
|
||||||
end
|
end
|
||||||
string += "</#{tag}>\n"
|
string += "</#{tag}>\n"
|
||||||
if indent == 0
|
|
||||||
string += "\n</treat>"
|
|
||||||
if options[:file]
|
|
||||||
File.open(options[:file], 'w') do |f|
|
|
||||||
f.write(string)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
options[:file]
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.escape(input)
|
def self.escape(input)
|
||||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Serializers::YAML
|
||||||
|
|
||||||
silence_warnings do
|
silence_warnings do
|
||||||
# Require the Psych YAML serializer.
|
# Require the Psych YAML serializer.
|
||||||
require 'psych'
|
require 'yaml'
|
||||||
end
|
end
|
||||||
|
|
||||||
# Serialize an entity in YAML format.
|
# Serialize an entity in YAML format.
|
||||||
|
@ -11,7 +11,7 @@ class Treat::Workers::Formatters::Serializers::YAML
|
||||||
# Options:
|
# Options:
|
||||||
# - (String) :file => a file to write to.
|
# - (String) :file => a file to write to.
|
||||||
def self.serialize(entity, options = {})
|
def self.serialize(entity, options = {})
|
||||||
yaml = ::Psych.dump(entity)
|
yaml = ::YAML.dump(entity)
|
||||||
options[:file] ||= (entity.id.to_s + '.yml')
|
options[:file] ||= (entity.id.to_s + '.yml')
|
||||||
if options[:file]
|
if options[:file]
|
||||||
File.open(options[:file], 'w') do |f|
|
File.open(options[:file], 'w') do |f|
|
||||||
|
|
|
@ -17,7 +17,7 @@ class Treat::Workers::Formatters::Unserializers::Mongo
|
||||||
|
|
||||||
@@database ||= Mongo::Connection.
|
@@database ||= Mongo::Connection.
|
||||||
new(Treat.databases.mongo.host).
|
new(Treat.databases.mongo.host).
|
||||||
db(Treat.databases.mongo.db || db)
|
db(db || Treat.databases.mongo.db)
|
||||||
|
|
||||||
supertype = Treat::Entities.const_get(
|
supertype = Treat::Entities.const_get(
|
||||||
entity.type.to_s.capitalize.intern).superclass.mn.downcase
|
entity.type.to_s.capitalize.intern).superclass.mn.downcase
|
||||||
|
|
|
@ -65,6 +65,7 @@ class Treat::Workers::Formatters::Unserializers::XML
|
||||||
value = v
|
value = v
|
||||||
else
|
else
|
||||||
v = v[1..-1].intern if v[0] == ':'
|
v = v[1..-1].intern if v[0] == ':'
|
||||||
|
v = ":".intern if v == :''
|
||||||
v = v.to_i if v =~ /^[0-9]*$/
|
v = v.to_i if v =~ /^[0-9]*$/
|
||||||
v = v.to_f if v =~ /^[0-9\.]*$/
|
v = v.to_f if v =~ /^[0-9\.]*$/
|
||||||
v = false if v == 'false'
|
v = false if v == 'false'
|
||||||
|
|
|
@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Unserializers::YAML
|
||||||
|
|
||||||
silence_warnings do
|
silence_warnings do
|
||||||
# Require the Psych YAML parser.
|
# Require the Psych YAML parser.
|
||||||
require 'psych'
|
require 'yaml'
|
||||||
end
|
end
|
||||||
|
|
||||||
# Require date to revive DateTime.
|
# Require date to revive DateTime.
|
||||||
|
@ -13,7 +13,7 @@ class Treat::Workers::Formatters::Unserializers::YAML
|
||||||
#
|
#
|
||||||
# Options: none.
|
# Options: none.
|
||||||
def self.unserialize(document, options = {})
|
def self.unserialize(document, options = {})
|
||||||
document << ::Psych.load(
|
document << ::YAML.load(
|
||||||
File.read(document.file))
|
File.read(document.file))
|
||||||
document
|
document
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
module Treat::Workers::Group
|
module Treat::Workers::Groupable
|
||||||
|
|
||||||
# Lazily load the worker classes in the group.
|
# Lazily load the worker classes in the group.
|
||||||
def const_missing(const)
|
def const_missing(const)
|
||||||
bits = self.ancestors[0].to_s.split('::')
|
bits = self.ancestors[0].to_s.split('::')
|
||||||
bits.collect! { |bit| bit.ucc }
|
bits.collect! { |bit| bit.ucc }
|
||||||
file = bits.join('/') + "/#{const.ucc}"
|
file = bits.join('/') + "/#{const.ucc}"
|
||||||
if not File.readable?(Treat.paths.lib + "#{file}.rb")
|
path = Treat.paths.lib + "#{file}.rb"
|
||||||
|
if not File.readable?(path)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"File '#{file}.rb' corresponding to " +
|
"File '#{file}.rb' corresponding to " +
|
||||||
"requested worker #{self}::#{const} " +
|
"requested worker #{self}::#{const} " +
|
||||||
|
@ -14,7 +15,7 @@ module Treat::Workers::Group
|
||||||
require file
|
require file
|
||||||
if not self.const_defined?(const)
|
if not self.const_defined?(const)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"File #{file} does not define " +
|
"File #{file}.rb does not define " +
|
||||||
"#{self}::#{const}."
|
"#{self}::#{const}."
|
||||||
end
|
end
|
||||||
const_get(const)
|
const_get(const)
|
||||||
|
@ -69,9 +70,7 @@ module Treat::Workers::Group
|
||||||
|
|
||||||
# Get constants in this module, excluding by
|
# Get constants in this module, excluding by
|
||||||
# default those defined by parent modules.
|
# default those defined by parent modules.
|
||||||
def const_get(const)
|
def const_get(const); super(const, false); end
|
||||||
super(const, false)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Modify the extended class.
|
# Modify the extended class.
|
||||||
def self.extended(group)
|
def self.extended(group)
|
|
@ -35,9 +35,9 @@ class Treat::Workers::Inflectors::Cardinalizers::Linguistics
|
||||||
# More specific options when using :type => :ordinal:
|
# More specific options when using :type => :ordinal:
|
||||||
def self.cardinal(entity, options = {})
|
def self.cardinal(entity, options = {})
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
Treat::Loaders::Linguistics.
|
lang = entity.language
|
||||||
load(options[:language]).
|
code = Treat::Loaders::Linguistics.load(lang)
|
||||||
numwords(entity.to_s, options)
|
entity.to_s.send(code).numwords(options)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -35,13 +35,15 @@ module Treat::Workers::Inflectors::Conjugators::Linguistics
|
||||||
|
|
||||||
options = Forms[options[:form].to_s] if options[:form]
|
options = Forms[options[:form].to_s] if options[:form]
|
||||||
|
|
||||||
klass = Treat::Loaders::Linguistics.load(entity.language)
|
code = Treat::Loaders::Linguistics.load(entity.language)
|
||||||
|
obj = entity.to_s.send(code)
|
||||||
|
|
||||||
if options[:mode] == 'infinitive'
|
if options[:mode] == 'infinitive'
|
||||||
silence_warnings { klass.infinitive(entity.to_s) }
|
obj.infinitive
|
||||||
elsif options[:mode] == 'participle' && options[:tense] == 'present'
|
elsif options[:mode] == 'participle' && options[:tense] == 'present'
|
||||||
silence_warnings { klass.present_participle(entity.to_s) }
|
obj.present_participle
|
||||||
elsif options[:count] == 'plural' && options.size == 1
|
elsif options[:count] == 'plural' && options.size == 1
|
||||||
silence_warnings { klass.plural_verb(entity.to_s) }
|
obj.plural_verb
|
||||||
else
|
else
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
'This combination of modes, tenses, persons ' +
|
'This combination of modes, tenses, persons ' +
|
||||||
|
|
|
@ -21,9 +21,9 @@ class Treat::Workers::Inflectors::Declensors::English
|
||||||
'option count ("singular" or "plural").'
|
'option count ("singular" or "plural").'
|
||||||
end
|
end
|
||||||
string = entity.to_s
|
string = entity.to_s
|
||||||
if options[:count] == 'plural'
|
if options[:count].to_s == 'plural'
|
||||||
Inflect.plural(string)
|
Inflect.plural(string)
|
||||||
elsif options[:count] == 'singular'
|
elsif options[:count].to_s == 'singular'
|
||||||
Inflect.singular(string)
|
Inflect.singular(string)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -17,34 +17,27 @@ class Treat::Workers::Inflectors::Declensors::Linguistics
|
||||||
|
|
||||||
cat = entity.get(:category)
|
cat = entity.get(:category)
|
||||||
return if cat && !POS.include?(cat)
|
return if cat && !POS.include?(cat)
|
||||||
|
|
||||||
unless options[:count]
|
unless options[:count]
|
||||||
raise Treat::Exception, 'Must supply ' +
|
raise Treat::Exception, 'Must supply ' +
|
||||||
':count option ("singular" or "plural").'
|
':count option ("singular" or "plural").'
|
||||||
end
|
end
|
||||||
|
|
||||||
klass = Treat::Loaders::
|
|
||||||
Linguistics.load(entity.language)
|
|
||||||
string = entity.to_s
|
|
||||||
|
|
||||||
if options[:count] == 'plural'
|
unless options[:count].to_s == 'plural'
|
||||||
if (entity.has?(:category))
|
|
||||||
result = ''
|
|
||||||
silence_warnings do
|
|
||||||
result = klass.send(
|
|
||||||
:"plural_#{entity.category}",
|
|
||||||
string)
|
|
||||||
end
|
|
||||||
return result
|
|
||||||
else
|
|
||||||
return klass.plural(string)
|
|
||||||
end
|
|
||||||
|
|
||||||
else
|
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Ruby Linguistics does not support " +
|
"Ruby Linguistics does not support " +
|
||||||
"singularization of words."
|
"singularization of words."
|
||||||
end
|
end
|
||||||
|
|
||||||
|
lang = entity.language
|
||||||
|
code = Treat::Loaders::Linguistics.load(lang)
|
||||||
|
obj = entity.to_s.send(code)
|
||||||
|
|
||||||
|
if cat = entity.get(:category)
|
||||||
|
method = "plural_#{cat}"
|
||||||
|
obj.send(method)
|
||||||
|
else; obj.plural; end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -11,11 +11,11 @@ class Treat::Workers::Inflectors::Ordinalizers::Linguistics
|
||||||
|
|
||||||
# Describe a number in words in ordinal form, using the
|
# Describe a number in words in ordinal form, using the
|
||||||
# 'linguistics' gem.
|
# 'linguistics' gem.
|
||||||
def self.ordinal(number, options = {})
|
def self.ordinal(entity, options = {})
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
klass = Treat::Loaders::
|
lang = entity.language
|
||||||
Linguistics.load(options[:language])
|
code = Treat::Loaders::Linguistics.load(lang)
|
||||||
klass.ordinate(number.to_s)
|
entity.to_s.send(code).ordinate
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -12,24 +12,20 @@ class Treat::Workers::Learners::Classifiers::ID3
|
||||||
@@classifiers = {}
|
@@classifiers = {}
|
||||||
|
|
||||||
def self.classify(entity, options = {})
|
def self.classify(entity, options = {})
|
||||||
|
dset = options[:training]
|
||||||
set = options[:training]
|
prob = dset.problem
|
||||||
cl = set.problem
|
if !@@classifiers[prob]
|
||||||
|
|
||||||
if !@@classifiers[cl]
|
|
||||||
dec_tree = DecisionTree::ID3Tree.new(
|
dec_tree = DecisionTree::ID3Tree.new(
|
||||||
cl.feature_labels.map { |l| l.to_s },
|
prob.feature_labels.map { |l| l.to_s },
|
||||||
set.items.map { |i| i[:features]},
|
dset.items.map { |i| i[:features] },
|
||||||
cl.question.default, cl.question.type)
|
prob.question.default, prob.question.type)
|
||||||
dec_tree.train
|
dec_tree.train
|
||||||
@@classifiers[cl] = dec_tree
|
@@classifiers[prob] = dec_tree
|
||||||
else
|
else
|
||||||
dec_tree = @@classifiers[cl]
|
dec_tree = @@classifiers[prob]
|
||||||
dec_tree.graph('testingbitch')
|
|
||||||
end
|
end
|
||||||
dec_tree.predict(
|
vect = prob.export_features(entity, false)
|
||||||
cl.export_features(entity, false)
|
dec_tree.predict(vect)
|
||||||
)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -11,35 +11,23 @@ class Treat::Workers::Learners::Classifiers::Linear
|
||||||
}
|
}
|
||||||
|
|
||||||
def self.classify(entity, options = {})
|
def self.classify(entity, options = {})
|
||||||
|
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
set = options[:training]
|
dset = options[:training]
|
||||||
problem = set.problem
|
prob, items = dset.problem, dset.items
|
||||||
|
if !@@classifiers[prob]
|
||||||
if !@@classifiers[problem]
|
lparam = LParameter.new
|
||||||
labels = problem.question.labels
|
lparam.solver_type = options[:solver_type]
|
||||||
unless labels
|
lparam.eps = options[:eps]
|
||||||
raise Treat::Exception,
|
lbls = items.map { |it| it[:features][-1] }
|
||||||
"LibLinear requires that you provide the possible " +
|
exs = items.map { |it| it[:features][0..-2] }.
|
||||||
"labels to assign to classification items when " +
|
map { |ary| self.array_to_hash(ary) }
|
||||||
"specifying the question."
|
lprob = LProblem.new(lbls, exs, options[:bias])
|
||||||
end
|
model = LModel.new(lprob, lparam)
|
||||||
param = LParameter.new
|
@@classifiers[prob] = model
|
||||||
param.solver_type = options[:solver_type]
|
|
||||||
param.eps = options[:eps]
|
|
||||||
bias = options[:bias]
|
|
||||||
data = set.items.map do |item|
|
|
||||||
self.array_to_hash(item[:features])
|
|
||||||
end
|
|
||||||
prob = LProblem.new(labels, data, bias)
|
|
||||||
@@classifiers[problem] =
|
|
||||||
LModel.new(prob, param)
|
|
||||||
end
|
end
|
||||||
|
features = prob.export_features(entity, false)
|
||||||
@@classifiers[problem].predict(
|
@@classifiers[prob].predict(
|
||||||
self.array_to_hash(problem.
|
self.array_to_hash(features))
|
||||||
export_features(entity, false)))
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.array_to_hash(array)
|
def self.array_to_hash(array)
|
||||||
|
|
|
@ -1,30 +1,43 @@
|
||||||
# Classification based on a multilayer perceptron.
|
# Classification based on a multilayer perceptron.
|
||||||
class Treat::Workers::Learners::Classifiers::MLP
|
class Treat::Workers::Learners::Classifiers::MLP
|
||||||
|
|
||||||
require 'ai4r'
|
require 'ruby_fann/neural_network'
|
||||||
|
|
||||||
@@mlps = {}
|
DefaultOptions = {
|
||||||
|
num_inputs: 3,
|
||||||
|
hidden_neurons: [2, 8, 4, 3, 4],
|
||||||
|
num_outputs: 1,
|
||||||
|
max_neurons: 1000,
|
||||||
|
neurons_between_reports: 1,
|
||||||
|
desired_error: 0.1
|
||||||
|
}
|
||||||
|
|
||||||
|
@@classifiers = {}
|
||||||
|
|
||||||
def self.classify(entity, options = {})
|
def self.classify(entity, options = {})
|
||||||
|
options = DefaultOptions.merge(options)
|
||||||
set = options[:training]
|
dset = options[:training]
|
||||||
cl = set.problem
|
prob, items = dset.problem, dset.items
|
||||||
|
if !@@classifiers[prob]
|
||||||
if !@@mlps[cl]
|
fann = RubyFann::Standard.new(options)
|
||||||
net = Ai4r::NeuralNetwork::Backpropagation.new(
|
inputs = items.map { |it| it[:features][0..-2] }
|
||||||
[cl.feature_labels.size, 3, 1])
|
outputs = items.map { |it| [it[:features][-1]] }
|
||||||
set.items.each do |item|
|
training = silence_stdout do
|
||||||
inputs = item[:features][0..-2]
|
RubyFann::TrainData.new(inputs:
|
||||||
outputs = [item[:features][-1]]
|
inputs, desired_outputs: outputs)
|
||||||
net.train(inputs, outputs)
|
|
||||||
end
|
end
|
||||||
@@mlps[cl] = net
|
params = [options[:max_neurons],
|
||||||
|
options[:neurons_between_reports],
|
||||||
|
options[:desired_error]]
|
||||||
|
fann.train_on_data(training, *params)
|
||||||
|
@@classifiers[prob] = fann
|
||||||
else
|
else
|
||||||
net = @@mlps[cl]
|
fann = @@classifiers[prob]
|
||||||
end
|
end
|
||||||
|
vect = prob.export_features(entity, false)
|
||||||
net.eval(cl.export_features(entity, false))[0]
|
Treat.core.verbosity.silence ?
|
||||||
|
silence_stdout { fann.run(vect)[0] } :
|
||||||
|
fann.run(vect)[0]
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -5,7 +5,7 @@ class Treat::Workers::Learners::Classifiers::SVM
|
||||||
@@classifiers = {}
|
@@classifiers = {}
|
||||||
|
|
||||||
DefaultOptions = {
|
DefaultOptions = {
|
||||||
cache_size: 1,
|
cache_size: 1, # in MB
|
||||||
eps: 0.001,
|
eps: 0.001,
|
||||||
c: 10
|
c: 10
|
||||||
}
|
}
|
||||||
|
@ -14,35 +14,25 @@ class Treat::Workers::Learners::Classifiers::SVM
|
||||||
# - (Numeric) :eps => tolerance of termination criterion
|
# - (Numeric) :eps => tolerance of termination criterion
|
||||||
# - (Numeric) :c => C parameter
|
# - (Numeric) :c => C parameter
|
||||||
def self.classify(entity, options = {})
|
def self.classify(entity, options = {})
|
||||||
|
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
set = options[:training]
|
dset = options[:training]
|
||||||
problem = set.problem
|
prob, items = dset.problem, dset.items
|
||||||
|
if !@@classifiers[prob]
|
||||||
if !@@classifiers[problem]
|
lprob = Libsvm::Problem.new
|
||||||
labels = problem.question.labels
|
lparam = Libsvm::SvmParameter.new
|
||||||
unless labels
|
lparam.cache_size = options[:cache_size]
|
||||||
raise Treat::Exception,
|
lparam.eps = options[:eps]
|
||||||
"LibSVM requires that you provide the possible " +
|
lparam.c = options[:c]
|
||||||
"labels to assign to classification items when " +
|
llabels = items.map { |it| it[:features][-1] }
|
||||||
"specifying the question."
|
lexamples = items.map { |it| it[:features][0..-2] }.
|
||||||
end
|
map { |ary| Libsvm::Node.features(ary) }
|
||||||
examples = set.items.map { |item| item[:features] }
|
lprob.set_examples(llabels, lexamples)
|
||||||
prob = Libsvm::Problem.new
|
model = Libsvm::Model.train(lprob, lparam)
|
||||||
prob.set_examples(labels, examples)
|
@@classifiers[prob] = model
|
||||||
param = Libsvm::SvmParameter.new
|
|
||||||
param.cache_size = options[:cache_size]
|
|
||||||
param.eps = options[:eps]
|
|
||||||
param.c = options[:c]
|
|
||||||
model = Libsvm::Model.train(problem, parameter)
|
|
||||||
@@classifiers[problem] = model
|
|
||||||
end
|
end
|
||||||
|
features = prob.export_features(entity, false)
|
||||||
features = problem.export_features(entity, false)
|
@@classifiers[prob].predict(
|
||||||
|
Libsvm::Node.features(features))
|
||||||
@@classifiers[problem].predict(
|
|
||||||
Libsvm::Node.features(*features))
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -28,8 +28,9 @@ class Treat::Workers::Lexicalizers::Categorizers::FromTag
|
||||||
|
|
||||||
tag = entity.check_has(:tag)
|
tag = entity.check_has(:tag)
|
||||||
|
|
||||||
return 'unknown' if tag.nil? || tag == '' || entity.type == :symbol
|
return 'unknown' if tag.nil? || tag == ''
|
||||||
return 'sentence' if tag == 'S' || entity.type == :sentence
|
return 'fragment' if tag == 'F'
|
||||||
|
return 'sentence' if tag == 'S'
|
||||||
return 'number' if entity.type == :number
|
return 'number' if entity.type == :number
|
||||||
|
|
||||||
return Ptc[entity.to_s] if entity.type == :punctuation
|
return Ptc[entity.to_s] if entity.type == :punctuation
|
||||||
|
|
|
@ -1,62 +1,79 @@
|
||||||
# Sense information (synonyms, antonyms, hypernyms
|
# Sense information (synonyms, antonyms, hypernyms
|
||||||
# and hyponyms) obtained through a Ruby parser that
|
# and hyponyms) obtained through a Ruby parser that
|
||||||
# accesses Wordnet flat files.
|
# accesses Wordnet flat files.
|
||||||
#
|
#
|
||||||
# Original paper: George A. Miller (1995). WordNet:
|
# Original paper: George A. Miller (1995). WordNet:
|
||||||
# A Lexical Database for English. Communications of
|
# A Lexical Database for English. Communications of
|
||||||
# the ACM Vol. 38, No. 11: 39-41.
|
# the ACM Vol. 38, No. 11: 39-41.
|
||||||
class Treat::Workers::Lexicalizers::Sensers::Wordnet
|
class Treat::Workers::Lexicalizers::Sensers::Wordnet
|
||||||
|
|
||||||
# Require the 'wordnet' gem (install as 'rwordnet').
|
# Require the 'wordnet' gem (install as 'rwordnet').
|
||||||
require 'wordnet'
|
require 'wordnet'
|
||||||
|
|
||||||
# Patch for bug.
|
# Patch for bug.
|
||||||
::WordNet.module_eval do
|
::WordNet.module_eval do
|
||||||
remove_const(:SynsetType)
|
remove_const(:SYNSET_TYPES)
|
||||||
const_set(:SynsetType,
|
const_set(:SYNSET_TYPES,
|
||||||
{"n" => "noun", "v" => "verb", "a" => "adj"})
|
{"n" => "noun", "v" => "verb", "a" => "adj"})
|
||||||
end
|
end
|
||||||
|
|
||||||
# Require an adaptor for Wordnet synsets.
|
# Require an adaptor for Wordnet synsets.
|
||||||
require_relative 'wordnet/synset'
|
require_relative 'wordnet/synset'
|
||||||
|
|
||||||
# Noun, adjective and verb indexes.
|
|
||||||
@@indexes = {}
|
|
||||||
|
|
||||||
# Obtain lexical information about a word using the
|
# Obtain lexical information about a word using the
|
||||||
# ruby 'wordnet' gem.
|
# ruby 'wordnet' gem.
|
||||||
def self.sense(word, options = nil)
|
def self.sense(word, options = nil)
|
||||||
|
|
||||||
category = word.check_has(:category)
|
category = word.check_has(:category)
|
||||||
|
|
||||||
unless options[:nym]
|
if !options[:nym]
|
||||||
raise Treat::Exception, "You must supply " +
|
raise Treat::Exception, "You must supply " +
|
||||||
"the :nym option (:synonym, :hypernym, etc.)"
|
"the :nym option ('synonyms', 'hypernyms', etc.)"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
if !options[:nym].is_a?(Symbol)
|
||||||
|
options[:nym] = options[:nym].intern
|
||||||
|
end
|
||||||
|
|
||||||
|
if ![:synonyms, :antonyms,
|
||||||
|
:hypernyms, :hyponyms].include?(options[:nym])
|
||||||
|
raise Treat::Exception, "You must supply " +
|
||||||
|
"a valid :nym option ('synonyms', 'hypernyms', etc.)"
|
||||||
|
end
|
||||||
|
|
||||||
unless ['noun', 'adjective', 'verb'].
|
unless ['noun', 'adjective', 'verb'].
|
||||||
include?(word.category)
|
include?(word.category)
|
||||||
return []
|
return []
|
||||||
end
|
end
|
||||||
|
|
||||||
cat = category.to_s.capitalize
|
cat = abbreviate(category)
|
||||||
|
|
||||||
@@indexes[cat] ||=
|
lemma = ::WordNet::Lemma.find(word.value.downcase, cat)
|
||||||
::WordNet.const_get(cat + 'Index').instance
|
|
||||||
lemma = @@indexes[cat].find(word.value.downcase)
|
|
||||||
|
|
||||||
return [] if lemma.nil?
|
return [] if lemma.nil?
|
||||||
synsets = []
|
synsets = []
|
||||||
|
|
||||||
lemma.synsets.each do |synset|
|
lemma.synsets.each do |synset|
|
||||||
synsets <<
|
synsets <<
|
||||||
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
||||||
end
|
end
|
||||||
|
|
||||||
((synsets.collect do |ss|
|
((synsets.collect do |ss|
|
||||||
ss.send(options[:nym])
|
ss.send(options[:nym])
|
||||||
end - [word.value]).flatten).uniq
|
end - [word.value]).
|
||||||
|
flatten).uniq.map do |token|
|
||||||
|
token.gsub('_', ' ')
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
def self.abbreviate category
|
||||||
|
if category == 'adjective'
|
||||||
|
:adj
|
||||||
|
elsif category == 'adverb'
|
||||||
|
:adv
|
||||||
|
else
|
||||||
|
category.to_sym
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
|
|
@ -40,15 +40,15 @@ class Treat::Workers::Lexicalizers::Taggers::Brill
|
||||||
return pair[1] if isolated_token
|
return pair[1] if isolated_token
|
||||||
end
|
end
|
||||||
|
|
||||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
if entity.is_a?(Treat::Entities::Group) &&
|
||||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
!entity.parent_sentence
|
||||||
!entity.parent_sentence)
|
|
||||||
entity.set :tag_set, :penn
|
entity.set :tag_set, :penn
|
||||||
end
|
end
|
||||||
|
|
||||||
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||||
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||||
|
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||||
|
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -61,15 +61,16 @@ class Treat::Workers::Lexicalizers::Taggers::Lingua
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
if entity.is_a?(Treat::Entities::Group) &&
|
||||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
!entity.parent_sentence
|
||||||
!entity.parent_sentence)
|
|
||||||
entity.set :tag_set, :penn
|
entity.set :tag_set, :penn
|
||||||
end
|
end
|
||||||
|
|
||||||
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||||
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||||
|
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||||
|
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -1,15 +1,15 @@
|
||||||
# POS tagging using (i) explicit use of both preceding
|
# POS tagging using a maximum entropy model, with (i)
|
||||||
# and following tag contexts via a dependency network
|
# explicit use of both preceding and following tag
|
||||||
# representation, (ii) broad use of lexical features,
|
# contexts via a dependency network representation,
|
||||||
# including jointly conditioning on multiple consecutive
|
# (ii) broad use of lexical features, including jointly
|
||||||
# words, (iii) effective use of priors in conditional
|
# conditioning on multiple consecutive words, (iii)
|
||||||
# loglinear models, and (iv) fine-grained modeling of
|
# effective use of priors in conditional loglinear models,
|
||||||
# unknown word features.
|
# and (iv) fine-grained modeling of unknown word features.
|
||||||
#
|
#
|
||||||
# Original paper: Toutanova, Manning, Klein and Singer.
|
# Original paper: Toutanova, Manning, Klein and Singer.
|
||||||
# 2003. Feature-Rich Part-of-Speech Tagging with a
|
# 2003. Feature-Rich Part-of-Speech Tagging with a
|
||||||
# Cyclic Dependency Network. In Proceedings of the
|
# Cyclic Dependency Network. In Proceedings of the
|
||||||
# Conference of the North American Chapter of the
|
# Conference of the North American Chapter of the
|
||||||
# Association for Computational Linguistics.
|
# Association for Computational Linguistics.
|
||||||
class Treat::Workers::Lexicalizers::Taggers::Stanford
|
class Treat::Workers::Lexicalizers::Taggers::Stanford
|
||||||
|
|
||||||
|
@ -25,34 +25,32 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
|
||||||
def self.tag(entity, options = {})
|
def self.tag(entity, options = {})
|
||||||
|
|
||||||
# Handle tags for sentences and phrases.
|
# Handle tags for sentences and phrases.
|
||||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
if entity.is_a?(Treat::Entities::Group) &&
|
||||||
(entity.is_a?(Treat::Entities::Phrase) &&
|
!entity.parent_sentence
|
||||||
!entity.parent_sentence)
|
|
||||||
|
|
||||||
tag_set = options[:tag_set]
|
tag_set = options[:tag_set]
|
||||||
entity.set :tag_set, tag_set
|
entity.set :tag_set, tag_set
|
||||||
end
|
end
|
||||||
|
|
||||||
if entity.is_a?(Treat::Entities::Sentence)
|
return 'S' if entity.is_a?(Treat::Entities::Sentence)
|
||||||
return 'S'
|
return 'P' if entity.is_a?(Treat::Entities::Phrase)
|
||||||
elsif entity.is_a?(Treat::Entities::Phrase)
|
return 'F' if entity.is_a?(Treat::Entities::Fragment)
|
||||||
return 'P'
|
return 'G' if entity.is_a?(Treat::Entities::Group)
|
||||||
end
|
|
||||||
|
|
||||||
# Handle options and initialize the tagger.
|
# Handle options and initialize the tagger.
|
||||||
lang = entity.language
|
lang = entity.language.intern
|
||||||
options = get_options(options, lang)
|
|
||||||
init_tagger(lang) unless @@taggers[lang]
|
init_tagger(lang) unless @@taggers[lang]
|
||||||
tokens, list = get_token_list(entity)
|
options = get_options(options, lang)
|
||||||
|
tokens, t_list = get_token_list(entity)
|
||||||
|
|
||||||
# Do the tagging.
|
# Do the tagging.
|
||||||
i = 0
|
i = 0
|
||||||
isolated_token = entity.is_a?(Treat::Entities::Token)
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
||||||
|
|
||||||
@@taggers[lang].apply(list).each do |tok|
|
@@taggers[lang].apply(t_list).each do |tok|
|
||||||
tokens[i].set :tag, tok.tag
|
tokens[i].set(:tag, tok.tag.split('-').first)
|
||||||
tokens[i].set :tag_set,
|
tokens[i].set(:tag_set,
|
||||||
options[:tag_set] if isolated_token
|
options[:tag_set]) if isolated_token
|
||||||
return tok.tag if isolated_token
|
return tok.tag if isolated_token
|
||||||
i += 1
|
i += 1
|
||||||
end
|
end
|
||||||
|
@ -61,21 +59,24 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
|
||||||
|
|
||||||
# Initialize the tagger for a language.
|
# Initialize the tagger for a language.
|
||||||
def self.init_tagger(language)
|
def self.init_tagger(language)
|
||||||
Treat::Loaders::Stanford.load(language)
|
unless @@taggers[language]
|
||||||
model = StanfordCoreNLP::Config::Models[:pos][language]
|
Treat::Loaders::Stanford.load(language)
|
||||||
model_path = Treat.libraries.stanford.model_path ||
|
unless StanfordCoreNLP.const_defined?('MaxentTagger')
|
||||||
Treat.paths.models + 'stanford/'
|
StanfordCoreNLP.load_class('MaxentTagger',
|
||||||
model = model_path + StanfordCoreNLP::
|
'edu.stanford.nlp.tagger.maxent')
|
||||||
Config::ModelFolders[:pos] + model
|
end
|
||||||
@@taggers[language] ||=
|
model = Treat::Loaders::Stanford.find_model(:pos,language)
|
||||||
StanfordCoreNLP::MaxentTagger.new(model)
|
tagger = StanfordCoreNLP::MaxentTagger.new(model)
|
||||||
|
@@taggers[language] = tagger
|
||||||
|
end
|
||||||
|
@@taggers[language]
|
||||||
end
|
end
|
||||||
|
|
||||||
# Handle the options for the tagger.
|
# Handle the options for the tagger.
|
||||||
def self.get_options(options, language)
|
def self.get_options(options, language)
|
||||||
options = DefaultOptions.merge(options)
|
options = DefaultOptions.merge(options)
|
||||||
if options[:tagger_model]
|
if options[:tagger_model]
|
||||||
::StanfordCoreNLP.set_model('pos.model',
|
StanfordCoreNLP.set_model('pos.model',
|
||||||
options[:tagger_model])
|
options[:tagger_model])
|
||||||
end
|
end
|
||||||
options[:tag_set] =
|
options[:tag_set] =
|
||||||
|
|
|
@ -2,16 +2,13 @@ class Treat::Workers::Processors::Chunkers::Autoselect
|
||||||
|
|
||||||
def self.chunk(entity, options = {})
|
def self.chunk(entity, options = {})
|
||||||
unless entity.has?(:format)
|
unless entity.has?(:format)
|
||||||
raise Treat::Exception,
|
entity.set :format, 'txt'
|
||||||
"Must have a format to autoselect chunker."
|
|
||||||
end
|
end
|
||||||
begin
|
begin
|
||||||
k = Treat::Workers::Processors::
|
k = Treat::Workers::Processors::Chunkers.const_get(entity.format.cc)
|
||||||
Chunkers.const_get(entity.format.cc)
|
|
||||||
k.chunk(entity, options)
|
k.chunk(entity, options)
|
||||||
rescue Treat::Exception
|
rescue Treat::Exception
|
||||||
Treat::Workers::Processors::
|
Treat::Workers::Processors::Chunkers::TXT.chunk(entity, options)
|
||||||
Chunkers::TXT.chunk(entity, options)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -3,12 +3,9 @@ class Treat::Workers::Processors::Chunkers::HTML
|
||||||
require 'nokogiri'
|
require 'nokogiri'
|
||||||
|
|
||||||
def self.chunk(entity, options = {})
|
def self.chunk(entity, options = {})
|
||||||
|
|
||||||
entity.check_hasnt_children
|
entity.check_hasnt_children
|
||||||
|
|
||||||
doc = Nokogiri::HTML(entity.value)
|
doc = Nokogiri::HTML(entity.value)
|
||||||
recurse(entity, doc)
|
self.recurse(entity, doc)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.recurse(node, html_node, level = 1)
|
def self.recurse(node, html_node, level = 1)
|
||||||
|
@ -16,7 +13,6 @@ class Treat::Workers::Processors::Chunkers::HTML
|
||||||
html_node.children.each do |child|
|
html_node.children.each do |child|
|
||||||
|
|
||||||
next if child.name == 'text'
|
next if child.name == 'text'
|
||||||
|
|
||||||
txt = child.inner_text
|
txt = child.inner_text
|
||||||
|
|
||||||
if child.name =~ /^h([0-9]{1})$/ ||
|
if child.name =~ /^h([0-9]{1})$/ ||
|
||||||
|
|
|
@ -12,16 +12,13 @@ class Treat::Workers::Processors::Chunkers::TXT
|
||||||
zones.each do |zone|
|
zones.each do |zone|
|
||||||
zone.strip!
|
zone.strip!
|
||||||
next if zone == ''
|
next if zone == ''
|
||||||
c = Treat::Entities::
|
c = Treat::Entities::Zone.from_string(zone)
|
||||||
Zone.from_string(zone)
|
|
||||||
if c.type == :title
|
if c.type == :title
|
||||||
if current.type == :section
|
if current.type == :section
|
||||||
current = current.parent
|
current = current.parent
|
||||||
current = entity << Treat::
|
current = entity << Treat::Entities::Section.new
|
||||||
Entities::Section.new
|
|
||||||
else
|
else
|
||||||
current = entity << Treat::
|
current = entity << Treat::Entities::Section.new
|
||||||
Entities::Section.new
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
current << c
|
current << c
|
||||||
|
|
|
@ -1,150 +1,88 @@
|
||||||
# Parsing using an interface to a Java implementation
|
# Parsing using an interface to a Java implementation
|
||||||
# of probabilistic natural language parsers, both
|
# of probabilistic natural language parsers, both
|
||||||
# optimized PCFG and lexicalized dependency parsers,
|
# optimized PCFG and lexicalized dependency parsers,
|
||||||
# and a lexicalized PCFG parser.
|
# and a lexicalized PCFG parser.
|
||||||
#
|
#
|
||||||
# Original paper: Dan Klein and Christopher D.
|
# Original paper: Dan Klein and Christopher D.
|
||||||
# Manning. 2003. Accurate Unlexicalized Parsing.
|
# Manning. 2003. Accurate Unlexicalized Parsing.
|
||||||
# Proceedings of the 41st Meeting of the Association
|
# Proceedings of the 41st Meeting of the Association
|
||||||
# for Computational Linguistics, pp. 423-430.
|
# for Computational Linguistics, pp. 423-430.
|
||||||
class Treat::Workers::Processors::Parsers::Stanford
|
class Treat::Workers::Processors::Parsers::Stanford
|
||||||
|
|
||||||
Pttc = Treat.tags.aligned.phrase_tags_to_category
|
Pttc = Treat.tags.aligned.phrase_tags_to_category
|
||||||
|
|
||||||
# Hold one instance of the pipeline per language.
|
# Hold one instance of the pipeline per language.
|
||||||
@@parsers = {}
|
@@parsers = {}
|
||||||
|
|
||||||
DefaultOptions = {
|
DefaultOptions = { model: nil }
|
||||||
:parser_model => nil,
|
|
||||||
:tagger_model => nil
|
|
||||||
}
|
|
||||||
|
|
||||||
# Parse the entity using the Stanford parser.
|
# Parse the entity using the Stanford parser.
|
||||||
#
|
|
||||||
# Options:
|
|
||||||
#
|
|
||||||
# - (Boolean) :silent => whether to silence the output
|
|
||||||
# of the JVM.
|
|
||||||
# - (String) :log_file => a filename to log output to
|
|
||||||
# instead of displaying it.
|
|
||||||
def self.parse(entity, options = {})
|
def self.parse(entity, options = {})
|
||||||
|
|
||||||
entity.check_hasnt_children
|
val, lang = entity.to_s, entity.language.intern
|
||||||
|
|
||||||
val = entity.to_s
|
Treat::Loaders::Stanford.load(lang)
|
||||||
lang = entity.language
|
|
||||||
init(lang, options)
|
|
||||||
|
|
||||||
tag_set = StanfordCoreNLP::Config::TagSets[lang]
|
tag_set = StanfordCoreNLP::Config::TagSets[lang]
|
||||||
|
|
||||||
text = ::StanfordCoreNLP::Text.new(val)
|
list = get_token_list(entity)
|
||||||
@@parsers[lang].annotate(text)
|
entity.remove_all!
|
||||||
|
|
||||||
text.get(:sentences).each do |s|
|
model_file = options[:model] ||
|
||||||
|
StanfordCoreNLP::Config::Models[:parse][lang]
|
||||||
if entity.is_a?(Treat::Entities::Sentence) ||
|
|
||||||
entity.is_a?(Treat::Entities::Phrase)
|
unless @@parsers[lang] && @@parsers[lang][model_file]
|
||||||
tag = s.get(:category).to_s
|
model_path = Treat.libraries.stanford.model_path ||
|
||||||
tag_s, tag_opt = *tag.split('-')
|
StanfordCoreNLP.model_path
|
||||||
tag_s ||= 'S'
|
model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
|
||||||
entity.set :tag, tag_s
|
model = File.join(model_path, model_folder, model_file)
|
||||||
entity.set :tag_opt, tag_opt if tag_opt
|
@@parsers[lang] ||= {}
|
||||||
recurse(s.get(:tree).children[0], entity, tag_set)
|
options = StanfordCoreNLP::Options.new
|
||||||
break #######
|
parser = StanfordCoreNLP::LexicalizedParser
|
||||||
else
|
.getParserFromFile(model, options)
|
||||||
recurse(s.get(:tree), entity, tag_set)
|
@@parsers[lang][model_file] = parser
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
parser = @@parsers[lang][model_file]
|
||||||
|
|
||||||
|
text = parser.apply(list)
|
||||||
|
|
||||||
|
recurse(text.children[0], entity, tag_set)
|
||||||
entity.set :tag_set, tag_set
|
entity.set :tag_set, tag_set
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.init(lang, options)
|
def self.recurse(java_node, ruby_node, tag_set)
|
||||||
return if @@parsers[lang]
|
|
||||||
|
|
||||||
Treat::Loaders::Stanford.load(lang)
|
java_node.children.each do |java_child|
|
||||||
|
|
||||||
options = DefaultOptions.merge(options)
|
|
||||||
StanfordCoreNLP.use(lang)
|
|
||||||
if options[:tagger_model]
|
|
||||||
::StanfordCoreNLP.set_model(
|
|
||||||
'pos.model', options[:tagger_model]
|
|
||||||
)
|
|
||||||
end
|
|
||||||
if options[:parser_model]
|
|
||||||
::StanfordCoreNLP.set_model(
|
|
||||||
'parser.model', options[:parser_model]
|
|
||||||
)
|
|
||||||
end
|
|
||||||
@@parsers[lang] ||=
|
|
||||||
::StanfordCoreNLP.load(
|
|
||||||
:tokenize, :ssplit, :pos, :lemma, :parse
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
# Helper method which recurses the tree supplied by
|
label = java_child.label
|
||||||
# the Stanford parser.
|
tag = label.get(:category).to_s
|
||||||
def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
|
|
||||||
|
|
||||||
if java_node.num_children == 0
|
if Pttc[tag] && Pttc[tag][tag_set]
|
||||||
|
ruby_child = Treat::Entities::Phrase.new
|
||||||
label = java_node.label
|
ruby_child.set :tag, tag
|
||||||
tag = label.get(:part_of_speech).to_s
|
|
||||||
tag_s, tag_opt = *tag.split('-')
|
|
||||||
tag_s ||= ''
|
|
||||||
ruby_node.value = java_node.value.to_s.strip
|
|
||||||
ruby_node.set :tag, tag_s
|
|
||||||
ruby_node.set :tag_opt, tag_opt if tag_opt
|
|
||||||
ruby_node.set :lemma, label.get(:lemma).to_s
|
|
||||||
|
|
||||||
additional_tags.each do |t|
|
|
||||||
lt = label.get(t)
|
|
||||||
ruby_node.set t, lt.to_s if lt
|
|
||||||
end
|
|
||||||
|
|
||||||
ruby_node
|
|
||||||
|
|
||||||
else
|
|
||||||
|
|
||||||
if java_node.num_children == 1 &&
|
|
||||||
java_node.children[0].num_children == 0
|
|
||||||
recurse(java_node.children[0],
|
|
||||||
ruby_node, tag_set, additional_tags)
|
|
||||||
return
|
|
||||||
end
|
|
||||||
|
|
||||||
java_node.children.each do |java_child|
|
|
||||||
|
|
||||||
label = java_child.label
|
|
||||||
tag = label.get(:category).to_s
|
|
||||||
tag_s, tag_opt = *tag.split('-')
|
|
||||||
tag_s ||= ''
|
|
||||||
|
|
||||||
if Pttc[tag_s] && Pttc[tag_s][tag_set]
|
|
||||||
ruby_child = Treat::Entities::Phrase.new
|
|
||||||
else
|
|
||||||
l = java_child.children[0].to_s
|
|
||||||
v = java_child.children[0].value.to_s.strip
|
|
||||||
|
|
||||||
# Mhmhmhmhmhm
|
|
||||||
val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
|
|
||||||
ruby_child = Treat::Entities::Token.from_string(val)
|
|
||||||
end
|
|
||||||
|
|
||||||
ruby_child.set :tag, tag_s
|
|
||||||
ruby_child.set :tag_opt, tag_opt if tag_opt
|
|
||||||
ruby_node << ruby_child
|
ruby_node << ruby_child
|
||||||
|
|
||||||
unless java_child.children.empty?
|
unless java_child.children.empty?
|
||||||
recurse(java_child, ruby_child, tag_set, additional_tags)
|
recurse(java_child, ruby_child, tag_set)
|
||||||
end
|
end
|
||||||
|
else
|
||||||
|
val = java_child.children[0].to_s
|
||||||
|
ruby_child = Treat::Entities::Token.from_string(val)
|
||||||
|
ruby_child.set :tag, tag
|
||||||
|
ruby_node << ruby_child
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.get_token_list(entity)
|
||||||
|
list = StanfordCoreNLP::ArrayList.new
|
||||||
|
entity.tokens.each do |token|
|
||||||
|
list.add(StanfordCoreNLP::Word.new(token.to_s))
|
||||||
|
end
|
||||||
|
list
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -13,7 +13,7 @@ class Treat::Workers::Processors::Segmenters::Punkt
|
||||||
silence_warnings { require 'punkt-segmenter' }
|
silence_warnings { require 'punkt-segmenter' }
|
||||||
|
|
||||||
# Require the YAML parser.
|
# Require the YAML parser.
|
||||||
silence_warnings { require 'psych' }
|
# silence_warnings { require 'psych' }
|
||||||
|
|
||||||
# Hold one copy of the segmenter per language.
|
# Hold one copy of the segmenter per language.
|
||||||
@@segmenters = {}
|
@@segmenters = {}
|
||||||
|
@ -87,7 +87,7 @@ class Treat::Workers::Processors::Segmenters::Punkt
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
t = ::Psych.load(File.read(model))
|
t = ::YAML.load(File.read(model))
|
||||||
|
|
||||||
@@segmenters[lang] =
|
@@segmenters[lang] =
|
||||||
::Punkt::SentenceTokenizer.new(t)
|
::Punkt::SentenceTokenizer.new(t)
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue