Compare commits
No commits in common. "master" and "mongo-data-sets" have entirely different histories.
@@ -12,6 +12,4 @@
 *.html
 *.yaml
 spec/sandbox.rb
-coverage/*
-benchmark/*
 TODO
.travis.yml (15 changed lines)
@@ -1,18 +1,11 @@
 language: ruby
-
 rvm:
 - 1.9.2
 - 1.9.3
-- 2.0
-- 2.1
-- 2.2
-
 before_install:
-- export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk-i386/"
-
-before_script:
+- export "JAVA_HOME=/usr/lib/jvm/java-6-openjdk/"
+before_script:
 - sudo apt-get install antiword
 - sudo apt-get install poppler-utils
-- rake treat:install[travis] --trace
-
-script: rake treat:spec --trace
+- rake treat:install[travis]
+script: rake treat:spec
.treat (35 changed lines)
@@ -1,35 +0,0 @@
# A boolean value indicating whether to silence
# the output of external libraries (e.g. Stanford
# tools, Enju, LDA, Ruby-FANN, Schiphol).
Treat.core.verbosity.silence = false

# A boolean value indicating whether to explain
# the steps that Treat is performing.
Treat.core.verbosity.debug = true

# A boolean value indicating whether Treat should
# try to detect the language of newly input text.
Treat.core.language.detect = false

# A string representing the language to default
# to when detection is off.
Treat.core.language.default = 'english'

# A symbol representing the finest level at which
# language detection should be performed if language
# detection is turned on.
Treat.core.language.detect_at = :document

# The directory containing executables and JAR files.
Treat.paths.bin = '##_INSTALLER_BIN_PATH_##'

# The directory containing trained models
Treat.paths.models = '##_INSTALLER_MODELS_PATH_##'

# Mongo database configuration.
Treat.databases.mongo.db = 'your_database'
Treat.databases.mongo.host = 'localhost'
Treat.databases.mongo.port = '27017'

# Include the DSL by default.
include Treat::Core::DSL
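The same options removed above can also be set at runtime through the struct-backed accessors (Treat.core, Treat.paths, Treat.databases). The following is a minimal sketch only; the values shown are illustrative and not taken from either branch:

require 'treat'

# Mirror a few of the .treat options programmatically; these accessor chains
# are the ones used in the .treat file above (values here are assumptions).
Treat.core.verbosity.silence = true
Treat.core.language.default  = 'english'
Treat.databases.mongo.db     = 'treat_test'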
Gemfile (39 changed lines)
@@ -1,45 +1,12 @@
-source 'https://rubygems.org'
+source :rubygems

 gemspec

 gem 'birch'
 gem 'schiphol'
-gem 'yomu'
-gem 'ruby-readability'
-gem 'nokogiri'
+gem 'sourcify'

 group :test do
 gem 'rspec'
 gem 'rake'
-gem 'terminal-table'
-gem 'simplecov'
-end
+end

-=begin
-gem 'linguistics'
-gem 'engtagger'
-gem 'open-nlp'
-gem 'stanford-core-nlp'
-gem 'rwordnet'
-gem 'scalpel'
-gem 'fastimage'
-gem 'decisiontree'
-gem 'whatlanguage'
-gem 'zip'
-gem 'nickel'
-gem 'tactful_tokenizer'
-gem 'srx-english'
-gem 'punkt-segmenter'
-gem 'chronic'
-gem 'uea-stemmer'
-gem 'rbtagger'
-gem 'ruby-stemmer'
-gem 'activesupport'
-gem 'rb-libsvm'
-gem 'tomz-liblinear-ruby-swig'
-gem 'ruby-fann'
-gem 'fuzzy-string-match'
-gem 'levenshtein-ffi'
-gem 'tf-idf-similarity'
-gem 'kronic'
-=end
LICENSE (4 changed lines)
@@ -1,4 +1,4 @@
-Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 2.0.0
+Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.2

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -15,7 +15,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.

 Author: Louis-Antoine Mullie (louis.mullie@gmail.com). Copyright 2011-12.

-A non-trivial amount of code has been incorporated and modified from other libraries:
+Non-trivial amount of code has been incorporated and modified from other libraries:

 - formatters/readers/odt.rb - Mark Watson (GPL license)
 - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
README.md (51 changed lines)
@@ -1,43 +1,34 @@
 [](http://travis-ci.org/#!/louismullie/treat)
-[](https://codeclimate.com/github/louismullie/treat)
+[](https://gemnasium.com/louismullie/treat)

+Treat is a framework for natural language processing and computational linguistics in Ruby. It provides a common API for a number of gems and external libraries for document retrieval, parsing, annotation, and information extraction.

-
+**Current features**

-**New in v2.0.5: [OpenNLP integration](https://github.com/louismullie/treat/commit/727a307af0c64747619531c3aa355535edbf4632) and [Yomu support](https://github.com/louismullie/treat/commit/e483b764e4847e48b39e91a77af8a8baa1a1d056)**

-Treat is a toolkit for natural language processing and computational linguistics in Ruby. The Treat project aims to build a language- and algorithm- agnostic NLP framework for Ruby with support for tasks such as document retrieval, text chunking, segmentation and tokenization, natural language parsing, part-of-speech tagging, keyword extraction and named entity recognition. Learn more by taking a [quick tour](https://github.com/louismullie/treat/wiki/Quick-Tour) or by reading the [manual](https://github.com/louismullie/treat/wiki/Manual).

-**Features**

 * Text extractors for PDF, HTML, XML, Word, AbiWord, OpenOffice and image formats (Ocropus).
-* Text chunkers, sentence segmenters, tokenizers, and parsers (Stanford & Enju).
-* Lexical resources (WordNet interface, several POS taggers for English).
-* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
+* Text retrieval with indexation and full-text search (Ferret).
+* Text chunkers, sentence segmenters, tokenizers, and parsers for several languages (Stanford & Enju).
 * Word inflectors, including stemmers, conjugators, declensors, and number inflection.
-* Serialization of annotated entities to YAML, XML or to MongoDB.
+* Lexical resources (WordNet interface, several POS taggers for English, Stanford taggers for several languages).
+* Language, date/time, topic words (LDA) and keyword (TF*IDF) extraction.
+* Serialization of annotated entities to YAML, XML formats or to MongoDB.
 * Visualization in ASCII tree, directed graph (DOT) and tag-bracketed (standoff) formats.
 * Linguistic resources, including language detection and tag alignments for several treebanks.
-* Machine learning (decision tree, multilayer perceptron, LIBLINEAR, LIBSVM).
-* Text retrieval with indexation and full-text search (Ferret).
+* Decision tree and multilayer perceptron classification (liblinear coming soon!)

-**Contributing**
+<br>

-I am actively seeking developers that can help maintain and expand this project. You can find a list of ideas for contributing to the project [here](https://github.com/louismullie/treat/wiki/Contributing).
+**Resources**

-**Authors**
+* Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
+* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
-Lead developper: @louismullie [[Twitter](https://twitter.com/LouisMullie)]
+* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
+* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
-Contributors:
+* View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
+* Open an [issue](https://github.com/louismullie/treat/issues).
-- @bdigital
-- @automatedtendencies
-- @LeFnord
-- @darkphantum
-- @whistlerbrk
-- @smileart
-- @erol
+<br>

 **License**

 This software is released under the [GPL License](https://github.com/louismullie/treat/wiki/License-Information) and includes software released under the GPL, Ruby, Apache 2.0 and MIT licenses.
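Both versions of the README above refer to the DSL that the .treat file loads with include Treat::Core::DSL and that the sweeten! code later in this diff wires up as lowercase entity constructors. A hedged usage sketch follows; the sample text, the tokenize call and the exact return values are assumptions, not taken from the diff:

require 'treat'
include Treat::Core::DSL

# Lowercase constructors map to Treat::Entities::<Type>.build (see config.rb
# further down in this diff); 'sentence' and 'word' are among the entity types
# listed in the entities configuration.
s = sentence('Treat builds a tree of entities from plain text.')
s.tokenize   # invoke a tokenizer worker on the sentence (assumed API)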
RELEASE (14 changed lines)
@@ -41,16 +41,4 @@ Treat - Text Retrieval, Extraction and Annotation Toolkit
 1.1.0

 * Complete refactoring of the core of the library.
 * Separated all configuration stuff from dynamic stuff.
-
-1.2.0
-
-* Added LIBSVM and LIBLINEAR classifier support.
-* Added support for serialization of documents and data sets to MongoDB.
-* Added specs for most of the core classes.
-* Several bug fixes.
-
-2.0.0rc1
-
-* MAJOR CHANGE: the old DSL is no longer supported. A new DSL style using
-  lowercase keywords is now used and must be required explicitly.
Rakefile (58 changed lines)
@@ -1,47 +1,29 @@
-# All commands are prefixed with "treat:".
+require 'date'
+require 'rspec/core/rake_task'

+task :default => :spec

 namespace :treat do

-  # Require the Treat library.
-  require_relative 'lib/treat'
-  # Sandbox a script, for development.
-  # Syntax: rake treat:sandbox
-  task :sandbox do
-    require_relative 'spec/sandbox'
+  RSpec::Core::RakeTask.new do |t|
+    task = ARGV[0].scan(/\[([a-z]*)\]/)
+    if task && task.size == 0
+      t.pattern = "./spec/*.rb"
+    else
+      t.pattern = "./spec/#{task[0][0]}.rb"
+    end
   end

-  # Prints the current version of Treat.
-  # Syntax: rake treat:version
   task :version do
-    puts Treat::VERSION
+    vpath = '../lib/treat/version.rb'
+    vfile = File.expand_path(vpath, __FILE__)
+    contents = File.read(vfile)
+    puts contents[/VERSION = "([^"]+)"/, 1]
   end

-  # Installs a language pack (default to english).
-  # A language pack is a set of gems, binaries and
-  # model files that support the various workers
-  # that are available for that particular language.
-  # Syntax: rake treat:install (installs english)
-  # - OR -  rake treast:install[some_language]
   task :install, [:language] do |t, args|
-    language = args.language || 'english'
-    Treat::Core::Installer.install(language)
+    require './lib/treat'
+    Treat.install(args.language || 'english')
-  end
-
-  # Runs 1) the core library specs and 2) the
-  # worker specs for a) all languages (default)
-  # or b) a specific language (if specified).
-  # Also outputs the coverage for the whole
-  # library to treat/coverage (using SimpleCov).
-  # N.B. the worker specs are dynamically defined
-  # following the examples found in spec/workers.
-  # (see /spec/language/workers for more info)
-  # Syntax: rake treat:spec (core + all langs)
-  # - OR -  rake treat:spec[some_language]
-  task :spec, [:language] do |t, args|
-    require_relative 'spec/helper'
-    Treat::Specs::Helper.start_coverage
-    Treat::Specs::Helper.run_library_specs
-    Treat::Specs::Helper.run_language_specs(args.language)
   end

 end
lib/treat.rb (51 changed lines)
@@ -1,23 +1,36 @@
-# Treat is a toolkit for natural language
-# processing and computational linguistics
-# in Ruby. The Treat project aims to build
-# a language- and algorithm- agnostic NLP
-# framework for Ruby with support for tasks
-# such as document retrieval, text chunking,
-# segmentation and tokenization, natural
-# language parsing, part-of-speech tagging,
-# keyword mining and named entity recognition.
-#
-# Author: Louis-Antoine Mullie (c) 2010-12.
-#
-# Released under the General Public License.
 module Treat

-  # * Load all the core classes. * #
-  require_relative 'treat/version'
-  require_relative 'treat/exception'
-  require_relative 'treat/autoload'
-  require_relative 'treat/modules'
-  require_relative 'treat/builder'
+  # Treat requires Ruby >= 1.9.2
+  if RUBY_VERSION < '1.9.2'
+    raise "Treat requires Ruby version 1.9.2 " +
+    "or higher, but current is #{RUBY_VERSION}."
+  end

+  # Custom exception class.
+  class Exception < ::Exception; end

+  # Load configuration options.
+  require 'treat/config'
+  # Load all workers.
+  require 'treat/helpers'
+  # Require library loaders.
+  require 'treat/loaders'
+  # Require all core classes.
+  require 'treat/core'
+  # Require all entity classes.
+  require 'treat/entities'
+  # Lazy load worker classes.
+  require 'treat/workers'
+  # Require proxies last.
+  require 'treat/proxies'

+  # Turn sugar on.
+  Treat::Config.sweeten!

+  # Install packages for a given language.
+  def self.install(language = :english)
+    require 'treat/installer'
+    Treat::Installer.install(language)
+  end

 end
@@ -1,44 +0,0 @@
# Basic mixin for all the main modules;
# takes care of requiring the right files
# in the right order for each one.
#
# If a module's folder (e.g. /entities)
# contains a file with a corresponding
# singular name (e.g. /entity), that
# base class is required first. Then,
# all the files that are found directly
# under that folder are required (but
# not those found in sub-folders).
module Treat::Autoload

  # Loads all the files for the base
  # module in the appropriate order.
  def self.included(base)
    m = self.get_module_name(base)
    d = self.get_module_path(m)
    n = self.singularize(m) + '.rb'
    f, p = File.join(d, n), "#{d}/*.rb"
    require f if File.readable?(f)
    Dir.glob(p).each { |f| require f }
  end

  # Returns the path to a module's dir.
  def self.get_module_path(name)
    file = File.expand_path(__FILE__)
    dirs = File.dirname(file).split('/')
    File.join(*dirs[0..-1], name)
  end

  # Return the downcased form of the
  # module's last name (e.g. "entities").
  def self.get_module_name(mod)
    mod.to_s.split('::')[-1].downcase
  end

  # Helper method to singularize words.
  def self.singularize(w)
    if w[-3..-1] == 'ies'; w[0..-4] + 'y'
    else; (w[-1] == 's' ? w[0..-2] : w); end
  end

end
@@ -1,6 +0,0 @@
class Treat::Builder
  include Treat::Core::DSL
  def initialize(&block)
    instance_exec(&block)
  end
end
@@ -0,0 +1,135 @@
module Treat::Config

  Paths = [ :tmp, :lib, :bin,
    :files, :data, :models, :spec ]

  class << self
    attr_accessor :config
  end

  Treat.module_eval do
    # Handle all missing methods as conf options.
    def self.method_missing(sym, *args, &block)
      super(sym, *args, &block) if sym == :to_ary
      Treat::Config.config[sym]
    end
  end

  def self.configure
    # Temporary configuration hash.
    config = { paths: {} }
    confdir = get_full_path(:lib) + 'treat/config'
    # Iterate over each directory in the config.
    Dir[confdir + '/*'].each do |dir|
      name = File.basename(dir, '.*').intern
      config[name] = {}
      # Iterate over each file in the directory.
      Dir[confdir + "/#{name}/*.rb"].each do |file|
        key = File.basename(file, '.*').intern
        config[name][key] = eval(File.read(file))
      end
    end
    # Get the path config.
    Paths.each do |path|
      config[:paths][path] = get_full_path(path)
    end
    # Get the tag alignments.
    configure_tags!(config[:tags][:aligned])
    # Convert hash to structs.
    self.config = self.hash_to_struct(config)
  end

  def self.get_full_path(dir)
    File.dirname(__FILE__) +
    '/../../' + dir.to_s + "/"
  end

  def self.configure_tags!(config)
    ts = config[:tag_sets]
    config[:word_tags_to_category] =
    align_tags(config[:word_tags], ts)
    config[:phrase_tags_to_category] =
    align_tags(config[:phrase_tags], ts)
  end

  # Align tag configuration.
  def self.align_tags(tags, tag_sets)
    wttc = {}
    tags.each_slice(2) do |desc, tags|
      category = desc.gsub(',', ' ,').
      split(' ')[0].downcase
      tag_sets.each_with_index do |tag_set, i|
        next unless tags[i]
        wttc[tags[i]] ||= {}
        wttc[tags[i]][tag_set] = category
      end
    end
    wttc
  end

  def self.hash_to_struct(hash)
    return hash if hash.keys.
    select { |k| !k.is_a?(Symbol) }.size > 0
    struct = Struct.new(
    *hash.keys).new(*hash.values)
    hash.each do |key, value|
      if value.is_a?(Hash)
        struct[key] =
        self.hash_to_struct(value)
      end
    end
    struct
  end

  # Turn on syntactic sugar.
  def self.sweeten!

    # Undo this in unsweeten! - # Fix
    Treat::Entities.module_eval do
      self.constants.each do |type|
        define_singleton_method(type) do |value='', id=nil|
          const_get(type).build(value, id)
        end
      end
    end

    return if Treat.core.syntax.sweetened
    Treat.core.syntax.sweetened = true
    Treat.core.entities.list.each do |type|
      next if type == :Symbol
      kname = cc(type).intern
      klass = Treat::Entities.const_get(kname)
      Object.class_eval do
        define_method(kname) do |val, opts={}|
          klass.build(val, opts)
        end
      end
    end

    Treat::Core.constants.each do |kname|
      Object.class_eval do
        klass = Treat::Core.const_get(kname)
        define_method(kname) do |*args|
          klass.new(*args)
        end
      end
    end

  end

  # Turn off syntactic sugar.
  def self.unsweeten!
    return unless Treat.core.syntax.sweetened
    Treat.core.syntax.sweetened = false
    Treat.core.entities.list.each do |type|
      name = cc(type).intern
      next if type == :Symbol
      Object.class_eval { remove_method(name) }
    end
  end

  # Run all configuration.
  self.configure

end
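The hash_to_struct helper in the file above is what turns the nested option hashes (like the config data files later in this diff) into chained accessors such as Treat.core.language.default. The following is a standalone sketch of the same idea, independent of Treat; the method body is a simplified restatement, not the exact code above:

# Convert a nested symbol-keyed hash into nested Structs so that options can
# be read as a method chain (simplified restatement of hash_to_struct above).
def hash_to_struct(hash)
  return hash unless hash.is_a?(Hash) && hash.keys.all? { |k| k.is_a?(Symbol) }
  struct = Struct.new(*hash.keys).new(*hash.values)
  hash.each do |key, value|
    struct[key] = hash_to_struct(value) if value.is_a?(Hash)
  end
  struct
end

config = hash_to_struct(language: { default: :english, detect: false })
puts config.language.default   # => english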
@@ -1,38 +0,0 @@
# This module uses structs to represent the
# configuration options that are stored in
# the /config folder.
module Treat::Config

  # Require configurable mix in.
  require_relative 'importable'

  # Make all configuration importable.
  extend Treat::Config::Importable

  # Core configuration options for entities.
  class Treat::Config::Entities; end

  # Configuration for paths to models, binaries,
  # temporary storage and file downloads.
  class Treat::Config::Paths; end

  # Configuration for all Treat workers.
  class Treat::Config::Workers; end

  # Helpful linguistic options.
  class Treat::Config::Linguistics; end

  # Supported workers for each language.
  class Treat::Config::Languages; end

  # Configuration options for external libraries.
  class Treat::Config::Libraries; end

  # Configuration options for database
  # connectivity (host, port, etc.)
  class Treat::Config::Databases; end

  # Configuration options for Treat core.
  class Treat::Config::Core; end

end
@@ -1,51 +0,0 @@
# Provide default functionality to load configuration
# options from flat files into their respective modules.
module Treat::Config::Configurable

  # When extended, add the .config property to
  # the class that is being operated on.
  def self.extended(base)
    class << base; attr_accessor :config; end
    base.class_eval { self.config = {} }
  end

  # Provide base functionality to configure
  # all modules. The behaviour is as follows:
  #
  # 1 - Check if a file named data/$CLASS$.rb
  # exists; if so, load that file as the base
  # configuration, i.e. "Treat.$CLASS$"; e.g.
  # "Treat.core"
  #
  # 2 - Check if a folder named data/$CLASS$
  # exists; if so, load each file in that folder
  # as a suboption of the main configuration,
  # i.e. "Treat.$CLASS$.$FILE$"; e.g. "Treat.workers"
  #
  # (where $CLASS$ is the lowercase name of
  # the concrete class being extended by this.)
  def configure!
    path = File.dirname(File.expand_path( # FIXME
    __FILE__)).split('/')[0..-4].join('/') + '/'
    main_dir = path + 'lib/treat/config/data/'
    mod_name = self.name.split('::')[-1].downcase
    conf_dir = main_dir + mod_name
    base_file = main_dir + mod_name + '.rb'
    if File.readable?(base_file)
      self.config = eval(File.read(base_file))
    elsif FileTest.directory?(conf_dir)
      self.config = self.from_dir(conf_dir)
    else; raise Treat::Exception,
      "No config file found for #{mod_name}."
    end
  end

  # * Helper methods for configuraton * #
  def from_dir(conf_dir)
    Hash[Dir[conf_dir + '/*'].map do |path|
      name = File.basename(path, '.*').intern
      [name, eval(File.read(path))]
    end]
  end

end
@@ -0,0 +1,4 @@
['xml', 'html', 'txt', 'odt',
'abw', 'doc', 'yaml', 'uea',
'lda', 'pdf', 'ptb', 'dot',
'ai', 'id3', 'svo', 'mlp' ]

@@ -0,0 +1,8 @@
{language_to_code: {
arabic: 'UTF-8',
chinese: 'GB18030',
english: 'UTF-8',
french: 'ISO_8859-1',
ferman: 'ISO_8859-1',
hebrew: 'UTF-8'
}}

@@ -0,0 +1,2 @@
{list: [:entity, :unknown, :email, :url, :symbol, :sentence, :punctuation, :number, :enclitic, :word, :token, :fragment, :phrase, :paragraph, :title, :zone, :list, :block, :page, :section, :collection, :document],
order: [:token, :fragment, :phrase, :sentence, :zone, :section, :document, :collection]}

@@ -0,0 +1,3 @@
{default: :english,
detect: false,
detect_at: :document}

@@ -0,0 +1,8 @@
{description: {
:tmp => 'temporary files',
:lib => 'class and module definitions',
:bin => 'binary files',
:files => 'user-saved files',
:models => 'model files',
:spec => 'spec test files'
}}

@@ -0,0 +1 @@
{sweetened: false}

@@ -0,0 +1 @@
{debug: false, silence: true}
@@ -1,54 +0,0 @@
{
  acronyms:
    ['xml', 'html', 'txt', 'odt',
     'abw', 'doc', 'yaml', 'uea',
     'lda', 'pdf', 'ptb', 'dot',
     'ai', 'id3', 'svo', 'mlp',
     'svm', 'srx', 'nlp'],

  encodings:
    {language_to_code: {
      arabic: 'UTF-8',
      chinese: 'GB18030',
      english: 'UTF-8',
      french: 'ISO_8859-1',
      ferman: 'ISO_8859-1',
      hebrew: 'UTF-8'
    }},

  entities:
    {list:
      [:entity, :unknown, :email,
       :url, :symbol, :sentence,
       :punctuation, :number,
       :enclitic, :word, :token, :group,
       :fragment, :phrase, :paragraph,
       :title, :zone, :list, :block,
       :page, :section, :collection,
       :document],
     order:
      [:token, :fragment, :group,
       :sentence, :zone, :section,
       :document, :collection]},

  language: {
    default: :english,
    detect: false,
    detect_at: :document
  },

  paths: {
    description: {
      tmp: 'temporary files',
      lib: 'class and module definitions',
      bin: 'binary files',
      files: 'user-saved files',
      models: 'model files',
      spec: 'spec test files'
    }
  },

  learning: {
    list: [:data_set, :export, :feature, :tag, :problem, :question]
  },

  syntax: { sweetened: false },

  verbosity: { debug: false, silence: true}
}
@@ -1,10 +0,0 @@
{
  default: {
    adapter: :mongo
  },
  mongo: {
    host: 'localhost',
    port: '27017',
    db: nil
  }
}
@@ -1,15 +0,0 @@
{
  list:
    [:entity, :unknown, :email,
     :url, :symbol, :sentence,
     :punctuation, :number,
     :enclitic, :word, :token,
     :fragment, :phrase, :paragraph,
     :title, :zone, :list, :block,
     :page, :section, :collection,
     :document],
  order:
    [:token, :fragment, :phrase,
     :sentence, :zone, :section,
     :document, :collection]
}
@@ -1,33 +0,0 @@
{
  dependencies: [
    'ferret', 'bson_ext', 'mongo', 'lda-ruby',
    'stanford-core-nlp', 'linguistics',
    'ruby-readability', 'whatlanguage',
    'chronic', 'kronic', 'nickel', 'decisiontree',
    'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
    'tf-idf-similarity', 'narray', 'fastimage',
    'fuzzy-string-match', 'levenshtein-ffi'
  ],
  workers: {
    learners: {
      classifiers: [:id3, :linear, :mlp, :svm]
    },
    extractors: {
      keywords: [:tf_idf],
      language: [:what_language],
      topic_words: [:lda],
      tf_idf: [:native],
      distance: [:levenshtein],
      similarity: [:jaro_winkler, :tf_idf]
    },
    formatters: {
      serializers: [:xml, :yaml, :mongo],
      unserializers: [:xml, :yaml, :mongo],
      visualizers: [:dot, :standoff, :tree]
    },
    retrievers: {
      searchers: [:ferret],
      indexers: [:ferret]
    }
  }
}
@@ -1,95 +0,0 @@
{
  dependencies: [
    'rbtagger', 'ruby-stemmer', 'punkt-segmenter', 'tactful_tokenizer', 'nickel', 'rwordnet',
    'uea-stemmer', 'engtagger', 'activesupport', 'srx-english', 'scalpel'
  ],
  workers: {
    extractors: {
      time: [:chronic, :kronic, :ruby, :nickel],
      topics: [:reuters],
      name_tag: [:stanford]
    },
    inflectors: {
      conjugators: [:linguistics],
      declensors: [:english, :linguistics],
      stemmers: [:porter, :porter_c, :uea],
      ordinalizers: [:linguistics],
      cardinalizers: [:linguistics]
    },
    lexicalizers: {
      taggers: [:lingua, :brill, :stanford],
      sensers: [:wordnet],
      categorizers: [:from_tag]
    },
    processors: {
      parsers: [:stanford],
      segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
      tokenizers: [:ptb, :stanford, :punkt, :open_nlp]
    }
  },
  stop_words: [
    "about", "also", "are", "away", "because", "been", "beside", "besides", "between", "but",
    "cannot", "could", "did", "etc", "even", "ever", "every", "for", "had", "have",
    "how", "into", "isn", "maybe", "non", "nor", "now", "should", "such", "than",
    "that", "then", "these", "this", "those", "though", "too", "was", "wasn", "were",
    "what", "when", "where", "which", "while", "who", "whom", "whose", "will", "with",
    "would", "wouldn", "yes"
  ]
}
@@ -1,148 +0,0 @@
{
  dependencies: [
    'punkt-segmenter', 'tactful_tokenizer', 'stanford-core-nlp'
  ],
  workers: {
    processors: {
      segmenters: [:scalpel],
      tokenizers: [:ptb,:stanford],
      parsers: [:stanford]
    },
    lexicalizers: {
      taggers: [:stanford],
      categorizers: [:from_tag]
    }
  },
  stop_words: [
    "ailleurs", "ainsi", "alors", "aucun", "aucune", "auquel", "aurai", "auras", "aurez", "aurons",
    "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait",
    "avec", "avez", "aviez", "avoir", "avons", "celui", "cependant", "certaine", "certaines", "certains",
    "ces", "cet", "cette", "ceux", "chacun", "chacune", "chaque", "comme", "constamment", "davantage",
    "depuis", "des", "desquelles", "desquels", "dessous", "dessus", "donc", "dont", "duquel", "egalement",
    "elles", "encore", "enfin", "ensuite", "etaient", "etais", "etait", "etes", "etiez", "etions",
    "etre", "eux", "guere", "ici", "ils", "jamais", "jusqu", "laquelle", "legerement", "lequel",
    "les", "lesquelles", "lesquels", "leur", "leurs", "lors", "lui", "maintenant", "mais", "malgre",
    "moi", "moins", "notamment", "parce", "plupart", "pourtant", "presentement", "presque", "puis", "puisque",
    "quand", "quant", "que", "quel", "quelqu", "quelque", "quelques", "qui", "quoi", "quoique",
    "rien", "selon", "serai", "seras", "serez", "serons", "seront", "soient", "soit", "sommes",
    "sont", "sous", "suis", "telle", "telles", "tels", "toi", "toujours", "tout", "toutes",
    "tres", "trop", "une", "vos", "votre", "vous"
  ]
}
@@ -1,137 +0,0 @@
#encoding: UTF-8

{
  dependencies: [
    'punkt-segmenter', 'tactful_tokenizer', 'stanford-core-nlp'
  ],
  workers: {
    processors: {
      segmenters: [:tactful, :punkt, :stanford, :scalpel],
      tokenizers: [:stanford, :punkt],
      parsers: [:stanford]
    },
    lexicalizers: {
      taggers: [:stanford],
      categorizers: [:from_tag]
    }
  },
  stop_words: [
    "alle", "allem", "alles", "andere", "anderem", "anderen", "anderer", "anderes", "auf", "bei",
    "beim", "bist", "dadurch", "dein", "deine", "deiner", "deines", "deins", "dem", "denen",
    "der", "deren", "des", "deshalb", "dessen", "diese", "diesem", "diesen", "dieser", "dieses",
    "ein", "eine", "einem", "einen", "einer", "eines", "euer", "euere", "eueren", "eueres",
    "für", "haben", "habt", "hatte", "hatten", "hattest", "hattet", "hierzu", "hinter", "ich",
    "ihr", "ihre", "ihren", "ihrer", "ihres", "indem", "ist", "jede", "jedem", "jeden",
    "jeder", "jedes", "kann", "kannst", "können", "könnt", "konnte", "konnten", "konntest", "konntet",
    "mehr", "mein", "meine", "meiner", "meines", "meins", "nach", "neben", "nicht", "nichts",
    "seid", "sein", "seine", "seiner", "seines", "seins", "sie", "sind", "über", "und",
    "uns", "unser", "unsere", "unter", "vor", "warst", "weil", "wenn", "werde", "werden",
    "werdet", "willst", "wir", "wird", "wirst", "wollen", "wollt", "wollte", "wollten", "wolltest",
    "wolltet", "zum", "zur"
  ]
}
@@ -1,162 +0,0 @@
{
  dependencies: [
    'punkt-segmenter', 'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: []
    }
  },
  stop_words: [
    "affinche", "alcun", "alcuna", "alcune", "alcuni", "alcuno", "allora", "altra", "altre", "altri",
    "altro", "anziche", "certa", "certe", "certi", "certo", "che", "chi", "chiunque", "comunque",
    "con", "cosa", "cose", "cui", "dagli", "dai", "dall", "dalla", "dalle", "darsi",
    "degli", "del", "dell", "della", "delle", "dello", "dunque", "egli", "eppure", "esse",
    "essi", "forse", "gia", "infatti", "inoltre", "invece", "lui", "malgrado", "mediante", "meno",
    "mentre", "mie", "miei", "mio", "modo", "molta", "molte", "molti", "molto", "negli",
    "nel", "nella", "nelle", "nessun", "nessuna", "nessuno", "niente", "noi", "nostra", "nostre",
    "nostri", "nostro", "nulla", "occorre", "ogni", "ognuno", "oltre", "oltretutto", "oppure", "ovunque",
    "ovvio", "percio", "pertanto", "piu", "piuttosto", "poca", "poco", "poiche", "propri", "proprie",
    "proprio", "puo", "qua", "qual", "qualche", "qualcuna", "qualcuno", "quale", "quali", "qualunque",
    "quando", "quant", "quante", "quanti", "quanto", "quantunque", "quegli", "quei", "quest", "questa",
    "queste", "questi", "questo", "qui", "quindi", "sebbene", "sembra", "sempre", "senza", "soltanto",
    "stessa", "stesse", "stessi", "stesso", "sugli", "sui", "sul", "sull", "sulla", "sulle",
    "suo", "suoi", "taluni", "taluno", "tanta", "tanti", "tanto", "tra", "tuo", "tuoi",
    "tutt", "tutta", "tutte", "tutto", "una", "uno", "voi"
  ]
}
@@ -1,11 +0,0 @@
{
  dependencies: [
    'punkt-segmenter', 'srx-polish'
  ],
  workers: {
    processors: {
      segmenters: [:srx, :punkt]
    }
  }
}
@@ -1,291 +0,0 @@
{
  dependencies: [
    'punkt-segmenter', 'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: []
    }
  },
  stop_words: [
    "abans", "aca", "acerca", "ahora", "aixo", "algo", "algu", "alguien", "algun", "alguna",
    "algunas", "algunes", "alguno", "algunos", "alguns", "alla", "alli", "allo", "altra", "altre",
    "altres", "amb", "amunt", "antes", "aquel", "aquell", "aquella", "aquellas", "aquelles", "aquellos",
    "aquells", "aquest", "aquesta", "aquestes", "aquests", "aqui", "asimismo", "aun", "aunque", "avall",
    "cada", "casi", "com", "como", "con", "cosas", "coses", "cual", "cuales", "cualquier",
    "cuando", "damunt", "darrera", "davant", "debe", "deben", "deber", "debia", "debian", "decia",
    "decian", "decir", "deia", "deien", "del", "demasiado", "des", "desde", "despues", "dicen",
    "diciendo", "dins", "dir", "diu", "diuen", "doncs", "ell", "ellas", "elles", "ells",
    "els", "encara", "entonces", "ese", "esos", "esser", "esta", "estan", "estando", "estant",
    "estar", "estaria", "estarian", "estarien", "estas", "estos", "farien", "feia", "feien", "fent",
    "fue", "fueron", "gaire", "gairebe", "hace", "hacia", "hacian", "haciendo", "haran", "hauria",
    "haurien", "hemos", "hola", "junto", "lejos", "les", "lloc", "los", "menos", "menys",
    "meva", "mias", "mio", "misma", "mismas", "mismo", "mismos", "molt", "molta", "moltes",
    "mon", "mucha", "mucho", "muy", "nadie", "ningu", "nomes", "nosaltres", "nosotros", "nostra",
    "nostre", "nuestra", "nuestras", "nuestro", "nuestros", "nunca", "otra", "pasa", "pasan", "pasara",
    "pasaria", "passara", "passaria", "passen", "perque", "poc", "pocas", "pocos", "podem", "poden",
    "podeu", "podria", "podrian", "podrien", "poques", "porque", "potser", "puc", "pudieron", "pudo",
    "puede", "pueden", "puesto", "qualsevol", "quan", "que", "queria", "querian", "qui", "quien",
    "quienes", "quiere", "quieren", "quin", "quina", "quines", "quins", "quizas", "segueent", "segun",
    "sempre", "seran", "seria", "serian", "seu", "seva", "sido", "siempre", "siendo", "siguiente",
    "sino", "sobretodo", "solamente", "sovint", "suya", "suyas", "suyo", "suyos", "tambe", "tambien",
    "tanmateix", "tanta", "tanto", "tendran", "tendria", "tendrian", "tenen", "teu", "teva", "tiene",
    "tienen", "tindran", "tindria", "tindrien", "toda", "todavia", "todo", "tota", "totes", "tras",
    "traves", "tuvieron", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "unas", "unes", "unos",
    "uns", "usaba", "usaban", "usada", "usades", "usado", "usan", "usando", "usant", "usar",
    "usat", "usava", "usaven", "usen", "vaig", "varem", "varen", "vareu", "vegada", "vegades",
    "vez", "volem", "volen", "voleu", "vora", "vos", "vosaltres", "vosotros", "vostra", "vostre",
    "voy", "vuestra", "vuestras", "vuestro", "vuestros", "vull"
  ]
}
@@ -1,289 +0,0 @@
{
  dependencies: [
    'punkt-segmenter', 'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: []
    }
  },
  stop_words: [
    "atminstone", "an", "anda", "aven", "aldrig", "alla", "alls", "allt", "alltid", "allting",
    "alltsa", "andra", "annan", "annars", "antingen", "att", "bakom", "bland", "blev", "bli",
    "bliva", "blivit", "bort", "bortom", "bredvid", "dar", "darav", "darefter", "darfor", "dari",
    "darigenom", "darvid", "dedar", "definitivt", "del", "den", "dendar", "denhar", "denna", "deras",
    "dessa", "dessutom", "desto", "det", "detta", "dylik", "efterat", "efter", "eftersom", "eller",
    "emellertid", "enbart", "endast", "enligt", "ens", "ensam", "envar", "eran", "etc", "ett",
    "exakt", "fatt", "fastan", "fick", "fler", "flera", "foljande", "foljde", "foljer", "for",
    "fore", "forhoppningsvis", "formodligen", "forr", "forra", "forutom", "forvisso", "fran", "framfor", "fullstandigt",
    "gang", "gar", "gatt", "ganska", "gav", "genom", "genomgaende", "ger", "gick", "gjorde",
    "gjort", "gor", "hade", "har", "harav", "har", "hej", "hela", "helst", "helt",
    "hitta", "hon", "honom", "hur", "huruvida", "huvudsakligen", "ibland", "icke", "ickedestomindre", "igen",
    "ihop", "inat", "ingen", "ingenstans", "inget", "innan", "innehalla", "inre", "inte", "inuti",
    "istaellet", "kanske", "klart", "knappast", "knappt", "kom", "komma", "kommer", "kraver", "kunde",
    "kunna", "lata", "later", "lagga", "langre", "laet", "lagd", "leta", "letar", "manga",
    "maste", "med", "medan", "medans", "mellan", "mest", "min", "mindre", "minst", "mittemellan",
    "motsvarande", "mycket", "nagon", "nagongang", "nagonsin", "nagonstans", "nagonting", "nagorlunda", "nagot", "namligen",
    "nar", "nara", "nasta", "nastan", "nedat", "nedanfor", "nerat", "ner", "nog", "normalt",
    "nummer", "nuvarande", "nytt", "oavsett", "och", "ocksa", "oppna", "over", "overallt", "ofta",
    "okej", "olika", "ovanfor", "ratt", "redan", "relativt", "respektive", "rimlig", "rimligen", "rimligt",
    "salunda", "savida", "saga", "sager", "sakert", "sand", "sarskilt", "satt", "sak", "samma",
    "samtliga", "sedd", "senare", "senaste", "ser", "sig", "sista", "sjaelv", "ska", "skall",
    "skickad", "skriva", "skulle", "snabb", "snarare", "snart", "som", "somliga", "speciellt", "stalla",
    "stallet", "starta", "strax", "stundom", "tackar", "tanka", "taga", "tagen", "tala", "tanke",
    "tidigare", "tills", "tog", "totalt", "trolig", "troligen", "tvaers", "tvars", "tycka", "tyckte",
    "tyvarr", "understundom", "upp", "uppenbarligen", "uppenbart", "utan", "utanfor", "uteslutande", "utom", "var",
    "varan", "vad", "val", "varde", "vanlig", "vanligen", "var", "vare", "varenda", "varfor",
    "varifran", "varit", "varje", "varken", "vars", "vart", "vem", "verkligen", "vidare", "vilken",
    "vill", "visar", "visst", "visste"
  ]
}
@@ -1,16 +0,0 @@
{
  punkt: {
    model_path: nil
  },
  reuters: {
    model_path: nil
  },
  stanford: {
    jar_path: nil,
    model_path: nil
  },
  open_nlp: {
    jar_path: nil,
    model_path: nil
  }
}
@@ -1,44 +0,0 @@
{
  categories:
    ['adjective', 'adverb', 'noun',
     'verb', 'interjection', 'clitic',
     'coverb', 'conjunction', 'determiner',
     'particle', 'preposition', 'pronoun',
     'number', 'symbol', 'punctuation',
     'complementizer'],

  punctuation: {
    punct_to_category: {
      '.' => 'period', ',' => 'comma', ';' => 'semicolon', ':' => 'colon',
      '?' => 'interrogation', '!' => 'exclamation', '"' => 'double_quote', "'" => 'single_quote',
      '$' => 'dollar', '%' => 'percent', '#' => 'hash', '*' => 'asterisk',
      '&' => 'ampersand', '+' => 'plus', '-' => 'dash', '/' => 'slash',
      '\\' => 'backslash', '^' => 'caret', '_' => 'underscore', '`' => 'tick',
      '|' => 'pipe', '~' => 'tilde', '@' => 'at', '[' => 'bracket',
      ']' => 'bracket', '{' => 'brace', '}' => 'brace', '(' => 'parenthesis',
      ')' => 'parenthesis', '<' => 'tag', '>' => 'tag'
  }}
}
@@ -1,328 +0,0 @@
{
  aligned: {
    tag_sets: [
      :claws_c5, :brown, :penn,
      :stutgart, :chinese, :paris7
    ],
    phrase_tags: [
      'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
      'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
      'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
      'Fragment', ['', '', 'FRAG', '', '', ''],
      'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
      'List marker', ['', '', 'LST', '', '', ''],
      'Not a phrase', ['', '', 'NAC', '', '', ''],
      'Noun phrase', ['', '', 'NP', '', '', 'NP'],
      'Verbal nucleus', ['', '', '', '', '', 'VN'],
      'Head of noun phrase', ['', '', 'NX', '', '', ''],
      'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
      'Parenthetical', ['', '', 'PRN', '', '', ''],
      'Particle', ['', '', 'PRT', '', '', ''],
      'Participial phrase', ['', '', '', '', '', 'VPart'],
      'Quantifier phrase', ['', '', 'QP', '', '', ''],
      'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
      'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
      'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
      'Verb phrase', ['', '', 'VP', '', '', ''],
      'Inverted yes/no question', ['', '', 'SQ', '', '', ''],
      'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
      'Wh adverb phrase', ['', '', 'WHADVP', '', '', ''],
      'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
      'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
      'Unknown', ['', '', 'X', '', '', ''],
      'Phrase', ['', '', 'P', '', '', 'Sint'],
      'Sentence', ['', '', 'S', '', '', 'SENT'],
      'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
    ],
    word_tags: [

      # Aligned tags for the Claws C5, Brown and Penn tag sets.
      'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
      'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
      'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
      'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
      'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
      'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
      'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
      'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
      'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
      'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],

      'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
      'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
      'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
      'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
      'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
      'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
      'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
      'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
      'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
      'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
      'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],

      'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
      'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
      'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
      'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
      'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],

      'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
      'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
      'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
      'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
      'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
      'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
      'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
      'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
      'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
      'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
      'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
      'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
      'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
      'Interjection', ['', '', '', '', '', 'I'],
      'Localizer', ['', '', '', '', 'LC',
|
|
||||||
|
|
||||||
'Measure word', ['', '', '', '', 'M'],
|
|
||||||
|
|
||||||
'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
|
|
||||||
'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
|
|
||||||
'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
|
|
||||||
'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
|
|
||||||
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
|
|
||||||
'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
|
|
||||||
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
|
|
||||||
'Noun, temporal', ['', '', '', '', 'NT', 'N'],
|
|
||||||
'Noun, verbal', ['', '', '', '', 'NN', 'N'],
|
|
||||||
|
|
||||||
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
|
|
||||||
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
|
|
||||||
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
|
|
||||||
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
|
|
||||||
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
|
||||||
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
|
||||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
|
||||||
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # FIXME
|
|
||||||
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
|
||||||
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
|
||||||
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
|
||||||
'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
|
|
||||||
'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
|
|
||||||
'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
|
|
||||||
'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
|
|
||||||
'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
|
|
||||||
'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
|
|
||||||
'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
|
|
||||||
'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
|
|
||||||
'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
|
|
||||||
|
|
||||||
'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
|
|
||||||
'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
|
|
||||||
'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
|
|
||||||
'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
|
|
||||||
'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
|
|
||||||
'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
|
|
||||||
'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
|
|
||||||
'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
|
|
||||||
'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
|
|
||||||
'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
|
|
||||||
'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
|
|
||||||
'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
|
|
||||||
'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
|
|
||||||
'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
|
|
||||||
'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
|
|
||||||
'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
|
|
||||||
'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
|
|
||||||
'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
|
|
||||||
'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
|
|
||||||
'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
|
|
||||||
'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
|
|
||||||
'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
|
|
||||||
'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
|
|
||||||
'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
|
|
||||||
'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
|
|
||||||
'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
|
|
||||||
'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
|
|
||||||
'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
|
|
||||||
'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
|
|
||||||
'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
|
|
||||||
'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
|
|
||||||
'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
|
|
||||||
|
|
||||||
'Particle', ['', '', '', '', '', 'PRT'],
|
|
||||||
'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
|
|
||||||
'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
|
|
||||||
'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
|
|
||||||
'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
|
|
||||||
'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
|
|
||||||
|
|
||||||
'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
|
|
||||||
'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
|
|
||||||
'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
|
|
||||||
'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
|
|
||||||
'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
|
|
||||||
|
|
||||||
'Possessive', ['POS', '$', 'POS'],
|
|
||||||
|
|
||||||
'Postposition', ['', '', '', 'APPO'],
|
|
||||||
|
|
||||||
'Circumposition, right', ['', '', '', 'APZR', ''],
|
|
||||||
|
|
||||||
'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
|
|
||||||
|
|
||||||
'Onomatopoeia', ['', '', '', '', 'ON'],
|
|
||||||
|
|
||||||
'Punctuation', ['', '', '', '', 'PU', 'PN'],
|
|
||||||
'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
|
|
||||||
|
|
||||||
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
|
||||||
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
|
||||||
'Punctuation, comma', ['PUN', ',', ',', '$,'],
|
|
||||||
'Punctuation, dash', ['PUN', '-', '-'],
|
|
||||||
'Punctuation, dollar sign', ['PUN', '', '$'],
|
|
||||||
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
|
||||||
'Punctuation, right bracket', ['PUR', ')', ')'],
|
|
||||||
'Punctuation, quotation mark, left', ['PUQ', '', '``'],
|
|
||||||
'Punctuation, quotation mark, right', ['PUQ', '', '"'],
|
|
||||||
|
|
||||||
'Punctuation, left bracket', ['PUL', '(', 'PPL'],
|
|
||||||
'Punctuation, right bracket', ['PUR', ')', 'PPR'],
|
|
||||||
'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
|
|
||||||
'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
|
|
||||||
'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
|
|
||||||
'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
|
|
||||||
|
|
||||||
'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
|
|
||||||
|
|
||||||
'Symbol', ['', '', 'SYM', 'XY'],
|
|
||||||
'Symbol, alphabetical', ['ZZ0', '', ''],
|
|
||||||
'Symbol, list item', ['', '', 'LS'],
|
|
||||||
|
|
||||||
# Not sure about these tags from the Chinese PTB.
|
|
||||||
'Aspect marker', ['', '', '', '', 'AS'], # ?
|
|
||||||
'Ba-construction', ['', '', '', '', 'BA'], # ?
|
|
||||||
'In relative', ['', '', '', '', 'DEC'], # ?
|
|
||||||
'Associative', ['', '', '', '', 'DER'], # ?
|
|
||||||
'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
|
|
||||||
'For words ? ', ['', '', '', '', 'ETC'], # ?
|
|
||||||
'In long bei-construct', ['', '', '', '', 'LB'], # ?
|
|
||||||
'In short bei-construct', ['', '', '', '', 'SB'], # ?
|
|
||||||
'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
|
|
||||||
'Particle, other', ['', '', '', '', 'MSP'], # ?
|
|
||||||
'Before VP', ['', '', '', '', 'DEV'], # ?
|
|
||||||
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
|
||||||
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
|
||||||
]},
|
|
||||||
enju: {
|
|
||||||
cat_to_category: {
|
|
||||||
'ADJ' => 'adjective',
|
|
||||||
'ADV' => 'adverb',
|
|
||||||
'CONJ' => 'conjunction',
|
|
||||||
'COOD' => 'conjunction',
|
|
||||||
'C' => 'complementizer',
|
|
||||||
'D' => 'determiner',
|
|
||||||
'N' => 'noun',
|
|
||||||
'P' => 'preposition',
|
|
||||||
'PN' => 'punctuation',
|
|
||||||
'SC' => 'conjunction',
|
|
||||||
'V' => 'verb',
|
|
||||||
'PRT' => 'particle'
|
|
||||||
},
|
|
||||||
cat_to_description: [
|
|
||||||
['ADJ', 'Adjective'],
|
|
||||||
['ADV', 'Adverb'],
|
|
||||||
['CONJ', 'Coordination conjunction'],
|
|
||||||
['C', 'Complementizer'],
|
|
||||||
['D', 'Determiner'],
|
|
||||||
['N', 'Noun'],
|
|
||||||
['P', 'Preposition'],
|
|
||||||
['SC', 'Subordination conjunction'],
|
|
||||||
['V', 'Verb'],
|
|
||||||
['COOD', 'Part of coordination'],
|
|
||||||
['PN', 'Punctuation'],
|
|
||||||
['PRT', 'Particle'],
|
|
||||||
['S', 'Sentence']
|
|
||||||
],
|
|
||||||
xcat_to_description: [
|
|
||||||
['COOD', 'Coordinated phrase/clause'],
|
|
||||||
['IMP', 'Imperative sentence'],
|
|
||||||
['INV', 'Subject-verb inversion'],
|
|
||||||
['Q', 'Interrogative sentence with subject-verb inversion'],
|
|
||||||
['REL', 'A relativizer included'],
|
|
||||||
['FREL', 'A free relative included'],
|
|
||||||
['TRACE', 'A trace included'],
|
|
||||||
['WH', 'A wh-question word included']
|
|
||||||
],
|
|
||||||
xcat_to_ptb: [
|
|
||||||
['ADJP', '', 'ADJP'],
|
|
||||||
['ADJP', 'REL', 'WHADJP'],
|
|
||||||
['ADJP', 'FREL', 'WHADJP'],
|
|
||||||
['ADJP', 'WH', 'WHADJP'],
|
|
||||||
['ADVP', '', 'ADVP'],
|
|
||||||
['ADVP', 'REL', 'WHADVP'],
|
|
||||||
['ADVP', 'FREL', 'WHADVP'],
|
|
||||||
['ADVP', 'WH', 'WHADVP'],
|
|
||||||
['CONJP', '', 'CONJP'],
|
|
||||||
['CP', '', 'SBAR'],
|
|
||||||
['DP', '', 'NP'],
|
|
||||||
['NP', '', 'NP'],
|
|
||||||
['NX', 'NX', 'NAC'],
|
|
||||||
['NP' 'REL' 'WHNP'],
|
|
||||||
['NP' 'FREL' 'WHNP'],
|
|
||||||
['NP' 'WH' 'WHNP'],
|
|
||||||
['PP', '', 'PP'],
|
|
||||||
['PP', 'REL', 'WHPP'],
|
|
||||||
['PP', 'WH', 'WHPP'],
|
|
||||||
['PRT', '', 'PRT'],
|
|
||||||
['S', '', 'S'],
|
|
||||||
['S', 'INV', 'SINV'],
|
|
||||||
['S', 'Q', 'SQ'],
|
|
||||||
['S', 'REL', 'SBAR'],
|
|
||||||
['S', 'FREL', 'SBAR'],
|
|
||||||
['S', 'WH', 'SBARQ'],
|
|
||||||
['SCP', '', 'SBAR'],
|
|
||||||
['VP', '', 'VP'],
|
|
||||||
['VP', '', 'VP'],
|
|
||||||
['', '', 'UK']
|
|
||||||
]},
|
|
||||||
paris7: {
|
|
||||||
tag_to_category: {
|
|
||||||
'C' => :complementizer,
|
|
||||||
'PN' => :punctuation,
|
|
||||||
'SC' => :conjunction
|
|
||||||
}
|
|
||||||
# Paris7 Treebank functional tags
|
|
||||||
=begin
|
|
||||||
SUJ (subject)
|
|
||||||
OBJ (direct object)
|
|
||||||
ATS (predicative complement of a subject)
|
|
||||||
ATO (predicative complement of a direct object)
|
|
||||||
MOD (modifier or adjunct)
|
|
||||||
A-OBJ (indirect complement introduced by à)
|
|
||||||
DE-OBJ (indirect complement introduced by de)
|
|
||||||
P-OBJ (indirect complement introduced by another preposition)
|
|
||||||
=end
|
|
||||||
},
|
|
||||||
ptb: {
|
|
||||||
escape_characters: {
|
|
||||||
'(' => '-LRB-',
|
|
||||||
')' => '-RRB-',
|
|
||||||
'[' => '-LSB-',
|
|
||||||
']' => '-RSB-',
|
|
||||||
'{' => '-LCB-',
|
|
||||||
'}' => '-RCB-'
|
|
||||||
},
|
|
||||||
phrase_tag_to_description: [
|
|
||||||
['S', 'Paris7 declarative clause'],
|
|
||||||
['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
|
|
||||||
['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
|
|
||||||
['SINV', 'Inverted declarative sentence'],
|
|
||||||
['SQ', 'Inverted yes/no question']
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1 @@
{adapter: :mongo}

@ -0,0 +1 @@
{host: 'localhost', port: '27017', db: nil }
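The two one-line files above set the default database adapter (:mongo) and its connection defaults. A minimal sketch of overriding them at runtime; the nested Treat.databases.mongo accessors are an assumption inferred from Treat.databases.default.adapter used later in this diff, and the database name is made up:

    require 'treat'

    Treat.databases.default.adapter = :mongo       # mirrors {adapter: :mongo}
    Treat.databases.mongo.host = 'localhost'       # assumed accessor names
    Treat.databases.mongo.port = '27017'
    Treat.databases.mongo.db   = 'treat_example'   # hypothetical database name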
@ -1,31 +0,0 @@
# Mixin that is extended by Treat::Config
# in order to provide a single point of
# access method to trigger the import.
module Treat::Config::Importable

  # Import relies on each configuration.
  require_relative 'configurable'

  # Store all the configuration in self.config
  def self.extended(base)
    class << base; attr_accessor :config; end
  end

  # Main function; loads all configuration options.
  def import!
    config, c = {}, Treat::Config::Configurable
    definition = :define_singleton_method
    Treat::Config.constants.each do |const|
      next if const.to_s.downcase.is_mixin?
      klass = Treat::Config.const_get(const)
      klass.class_eval { extend c }.configure!
      name = const.to_s.downcase.intern
      config[name] = klass.config
      Treat.send(definition, name) do
        Treat::Config.config[name]
      end
    end
    self.config = config.to_struct
  end

end
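For context, a sketch of how this (now removed) mixin was used; the call sequence is inferred from the code above rather than shown elsewhere in the diff, and the Core class name is an assumption:

    # Extend the config root, then trigger the import; afterwards each
    # configuration class is reachable as a singleton reader on Treat.
    Treat::Config.extend(Treat::Config::Importable)
    Treat::Config.import!

    Treat.paths   # => struct built by Treat::Config::Paths
    Treat.core    # => struct built by Treat::Config::Core (assumed class name)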
@ -0,0 +1,34 @@
{
  dependencies: [
    'psych',
    'nokogiri',
    'ferret',
    'bson_ext',
    'mongo',
    'lda-ruby',
    'stanford-core-nlp',
    'linguistics',
    'ruby-readability',
    'whatlanguage',
    'chronic',
    'nickel',
    'decisiontree',
    'ai4r'
  ],
  workers: {
    extractors: {
      keywords: [:tf_idf],
      language: [:what_language]
    },
    formatters: {
      serializers: [:xml, :yaml, :mongo]
    },
    lexicalizers: {
      categorizers: [:from_tag]
    },
    inflectors: {
      ordinalizers: [:linguistics],
      cardinalizers: [:linguistics]
    }
  }
}
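The workers listed above are the language-agnostic defaults. As a hedged illustration only (the #serialize task name is an assumption, not confirmed by this diff), the serializer symbols are what a call like the following would dispatch to:

    # Serialize a document with one of the formatters listed above.
    document.serialize(:mongo)   # or :xml / :yaml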
@ -6,7 +6,7 @@
   workers: {
     processors: {
       segmenters: [:punkt],
-      tokenizers: []
+      tokenizers: [:tactful]
     }
   }
 }
@ -0,0 +1,60 @@
|
||||||
|
{
|
||||||
|
dependencies: [
|
||||||
|
'rbtagger',
|
||||||
|
'ruby-stemmer',
|
||||||
|
'punkt-segmenter',
|
||||||
|
'tactful_tokenizer',
|
||||||
|
'nickel',
|
||||||
|
'rwordnet',
|
||||||
|
'uea-stemmer',
|
||||||
|
'engtagger',
|
||||||
|
'activesupport',
|
||||||
|
'english'
|
||||||
|
],
|
||||||
|
workers: {
|
||||||
|
extractors: {
|
||||||
|
time: [:chronic, :ruby, :nickel],
|
||||||
|
topics: [:reuters],
|
||||||
|
keywords: [:tf_idf],
|
||||||
|
name_tag: [:stanford],
|
||||||
|
coreferences: [:stanford]
|
||||||
|
},
|
||||||
|
inflectors: {
|
||||||
|
conjugators: [:linguistics],
|
||||||
|
declensors: [:english, :linguistics, :active_support],
|
||||||
|
stemmers: [:porter, :porter_c, :uea],
|
||||||
|
ordinalizers: [:linguistics],
|
||||||
|
cardinalizers: [:linguistics]
|
||||||
|
},
|
||||||
|
lexicalizers: {
|
||||||
|
taggers: [:lingua, :brill, :stanford],
|
||||||
|
sensers: [:wordnet]
|
||||||
|
},
|
||||||
|
processors: {
|
||||||
|
parsers: [:stanford, :enju],
|
||||||
|
segmenters: [:tactful, :punkt, :stanford],
|
||||||
|
tokenizers: [:ptb, :stanford, :tactful, :punkt]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
info: {
|
||||||
|
stopwords:
|
||||||
|
['the', 'of', 'and', 'a', 'to', 'in', 'is',
|
||||||
|
'you', 'that', 'it', 'he', 'was', 'for', 'on',
|
||||||
|
'are', 'as', 'with', 'his', 'they', 'I', 'at',
|
||||||
|
'be', 'this', 'have', 'from', 'or', 'one', 'had',
|
||||||
|
'by', 'word', 'but', 'not', 'what', 'all', 'were',
|
||||||
|
'we', 'when', 'your', 'can', 'said', 'there', 'use',
|
||||||
|
'an', 'each', 'which', 'she', 'do', 'how', 'their',
|
||||||
|
'if', 'will', 'up', 'other', 'about', 'out', 'many',
|
||||||
|
'then', 'them', 'these', 'so', 'some', 'her', 'would',
|
||||||
|
'make', 'like', 'him', 'into', 'time', 'has', 'look',
|
||||||
|
'two', 'more', 'write', 'go', 'see', 'number', 'no',
|
||||||
|
'way', 'could', 'people', 'my', 'than', 'first', 'been',
|
||||||
|
'call', 'who', 'its', 'now', 'find', 'long', 'down',
|
||||||
|
'day', 'did', 'get', 'come', 'made', 'may', 'part',
|
||||||
|
'say', 'also', 'new', 'much', 'should', 'still',
|
||||||
|
'such', 'before', 'after', 'other', 'then', 'over',
|
||||||
|
'under', 'therefore', 'nonetheless', 'thereafter',
|
||||||
|
'afterwards', 'here', 'huh', 'hah', "n't", "'t", 'here']
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer',
    'stanford-core-nlp'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful],
      parsers: [:stanford]
    },
    lexicalizers: {
      taggers: [:stanford],
      categorizers: [:from_tag]
    }
  }
}
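A sketch of how a per-language worker list like this one is consumed: the task method picks the listed worker for that category. Entity construction via .build is taken from the DSL code later in this diff; the Sentence class and the #tokenize / #parse task names are assumptions for illustration:

    sentence = Treat::Entities::Sentence.build(
      'The Stanford parser handles this sentence.')
    sentence.tokenize(:tactful)   # tokenizers: [:tactful]
    sentence.parse(:stanford)     # parsers:    [:stanford]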
@ -0,0 +1,18 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer',
    'stanford'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful],
      parsers: [:stanford]
    },
    lexicalizers: {
      taggers: [:stanford],
      categorizers: [:from_tag]
    }
  }
}
@ -6,7 +6,7 @@
   workers: {
     processors: {
       segmenters: [:punkt],
-      tokenizers: []
+      tokenizers: [:tactful]
     }
   }
 }
@ -0,0 +1,12 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful]
    }
  }
}
@ -0,0 +1,12 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful]
    }
  }
}
@ -6,7 +6,7 @@
   workers: {
     processors: {
       segmenters: [:punkt],
-      tokenizers: []
+      tokenizers: [:tactful]
     }
   }
 }
@ -6,7 +6,7 @@
   workers: {
     processors: {
       segmenters: [:punkt],
-      tokenizers: []
+      tokenizers: [:tactful]
     }
   }
 }
@ -0,0 +1,12 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful]
    }
  }
}
@ -0,0 +1,12 @@
{
  dependencies: [
    'punkt-segmenter',
    'tactful_tokenizer'
  ],
  workers: {
    processors: {
      segmenters: [:punkt],
      tokenizers: [:tactful]
    }
  }
}
@ -0,0 +1 @@
{jar_path: nil, model_path: nil}
@ -0,0 +1,4 @@
['adjective', 'adverb', 'noun', 'verb', 'interjection',
 'clitic', 'coverb', 'conjunction', 'determiner', 'particle',
 'preposition', 'pronoun', 'number', 'symbol', 'punctuation',
 'complementizer']
@ -0,0 +1,33 @@
{punct_to_category: {
  '.' => 'period',
  ',' => 'comma',
  ';' => 'semicolon',
  ':' => 'colon',
  '?' => 'interrogation',
  '!' => 'exclamation',
  '"' => 'double_quote',
  "'" => 'single_quote',
  '$' => 'dollar',
  '%' => 'percent',
  '#' => 'hash',
  '*' => 'asterisk',
  '&' => 'ampersand',
  '+' => 'plus',
  '-' => 'dash',
  '/' => 'slash',
  '\\' => 'backslash',
  '^' => 'caret',
  '_' => 'underscore',
  '`' => 'tick',
  '|' => 'pipe',
  '~' => 'tilde',
  '@' => 'at',
  '[' => 'bracket',
  ']' => 'bracket',
  '{' => 'brace',
  '}' => 'brace',
  '(' => 'parenthesis',
  ')' => 'parenthesis',
  '<' => 'tag',
  '>' => 'tag'
}}
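Given the hash above loaded as punct, categorizing a punctuation character is a plain lookup (sketch):

    punct = {punct_to_category: {';' => 'semicolon', '?' => 'interrogation'}} # excerpt of the map above
    punct[:punct_to_category][';']   # => 'semicolon'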
@ -1,23 +0,0 @@
# Generates the following path config options:
# Treat.paths.tmp, Treat.paths.bin, Treat.paths.lib,
# Treat.paths.models, Treat.paths.files, Treat.paths.spec.
class Treat::Config::Paths

  # Get the path configuration based on the
  # directory structure loaded into Paths.
  # Note that this doesn't call super, as
  # there are no external config files to load.
  def self.configure!
    root = File.dirname(File.expand_path( # FIXME
      __FILE__)).split('/')[0..-4].join('/') + '/'
    self.config = Hash[
      # Get a list of directories in treat/
      Dir.glob(root + '*').select do |path|
        FileTest.directory?(path)
      # Map to pairs of [:name, path]
      end.map do |path|
        [File.basename(path).intern, path + '/']
      end]
  end

end
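A sketch of what configure! produced: every top-level directory of the gem became a [:name, path] pair in the config hash, which the Importable mixin then exposed as Treat.paths. The example return values are placeholders:

    Treat::Config::Paths.configure!
    Treat::Config::Paths.config[:lib]    # => ".../treat/lib/"  (example value)
    Treat::Config::Paths.config[:spec]   # => ".../treat/spec/" (example value)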
@ -1,37 +0,0 @@
# Handles all configuration related
# to understanding of part of speech
# and phrasal tags.
class Treat::Config::Tags

  # Generate a map of word and phrase tags
  # to their syntactic category, keyed by
  # tag set.
  def self.configure!
    super
    config = self.config[:aligned].dup
    word_tags, phrase_tags, tag_sets =
      config[:word_tags], config[:phrase_tags]
    tag_sets = config[:tag_sets]
    config[:word_tags_to_category] =
      align_tags(word_tags, tag_sets)
    config[:phrase_tags_to_category] =
      align_tags(phrase_tags, tag_sets)
    self.config[:aligned] = config
  end

  # Helper methods for tag set config.
  # Align tag tags in the tag set
  def self.align_tags(tags, tag_sets)
    wttc = {}
    tags.each_slice(2) do |desc, tags|
      category = desc.gsub(',', ' ,').
        split(' ')[0].downcase
      tag_sets.each_with_index do |tag_set, i|
        next unless tags[i]
        wttc[tags[i]] ||= {}
        wttc[tags[i]][tag_set] = category
      end
    end; return wttc
  end

end
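A worked example of align_tags from the code above, using one row of the aligned word_tags table and the tag_sets list; the return value shows how each concrete tag maps back to its category per tag set:

    row  = ['Adjective, comparative',
            ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ']]
    sets = [:claws_c5, :brown, :penn, :stutgart, :chinese, :paris7]

    Treat::Config::Tags.align_tags(row, sets)['JJR']
    # => { :brown => 'adjective', :penn => 'adjective' }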
@ -0,0 +1,221 @@
|
||||||
|
{tag_sets: [
|
||||||
|
:claws_c5, :brown, :penn, :stutgart, :chinese, :paris7
|
||||||
|
],
|
||||||
|
phrase_tags: [
|
||||||
|
'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
|
||||||
|
'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
|
||||||
|
'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
|
||||||
|
'Fragment', ['', '', 'FRAG', '', '', ''],
|
||||||
|
'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
|
||||||
|
'List marker', ['', '', 'LST', '', '', ''],
|
||||||
|
'Not a phrase', ['', '', 'NAC', '', '', ''],
|
||||||
|
'Noun phrase', ['', '', 'NP', '', '', 'NP'],
|
||||||
|
'Verbal nucleus', ['', '', '', '', '', 'VN'],
|
||||||
|
'Head of noun phrase', ['', '', 'NX', '', '', ''],
|
||||||
|
'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
|
||||||
|
'Parenthetical', ['', '', 'PRN', '', '', ''],
|
||||||
|
'Particle', ['', '', 'PRT', '', '', ''],
|
||||||
|
'Participial phrase', ['', '', '', '', '', 'VPart'],
|
||||||
|
'Quantifier phrase', ['', '', 'QP', '', '', ''],
|
||||||
|
'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
|
||||||
|
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
||||||
|
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
||||||
|
'Verb phrase', ['', '', 'VP', '', '', ''],
|
||||||
|
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
||||||
|
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
||||||
|
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
||||||
|
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
||||||
|
'Unknown', ['', '', 'X', '', '', ''],
|
||||||
|
'Phrase', ['', '', 'P', '', '', 'Sint'],
|
||||||
|
'Sentence', ['', '', 'S', '', '', 'SENT'],
|
||||||
|
'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
|
||||||
|
],
|
||||||
|
word_tags: [
|
||||||
|
|
||||||
|
# Aligned tags for the Claws C5, Brown and Penn tag sets.
|
||||||
|
# Adapted from Manning, Christopher and Schütze, Hinrich,
|
||||||
|
# 1999. Foundations of Statistical Natural Language
|
||||||
|
# Processing. MIT Press, p. 141-142;
|
||||||
|
# http://www.isocat.org/rest/dcs/376;
|
||||||
|
|
||||||
|
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
|
||||||
|
'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
|
||||||
|
'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
|
||||||
|
'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
|
||||||
|
'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
|
||||||
|
'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
|
||||||
|
'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
|
||||||
|
'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
|
||||||
|
'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
||||||
|
'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
|
||||||
|
|
||||||
|
'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
|
||||||
|
'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
|
||||||
|
'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
|
||||||
|
'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
|
||||||
|
'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
|
||||||
|
'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
|
||||||
|
'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
|
||||||
|
'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
|
||||||
|
'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
|
||||||
|
'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
|
||||||
|
'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
|
||||||
|
|
||||||
|
'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
|
||||||
|
'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
|
||||||
|
'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
|
||||||
|
'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
|
||||||
|
'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
|
||||||
|
|
||||||
|
'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
|
||||||
|
'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
|
||||||
|
'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
|
||||||
|
'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
|
||||||
|
'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
|
||||||
|
'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
|
||||||
|
'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
|
||||||
|
'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
|
||||||
|
'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
|
||||||
|
'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
|
||||||
|
'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
|
||||||
|
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
||||||
|
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
||||||
|
'Interjection', ['', '', '', '', '', 'I'],
|
||||||
|
'Localizer', ['', '', '', '', 'LC'],
|
||||||
|
|
||||||
|
'Measure word', ['', '', '', '', 'M'],
|
||||||
|
|
||||||
|
'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
|
||||||
|
'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
|
||||||
|
'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
|
||||||
|
'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
|
||||||
|
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
|
||||||
|
'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
|
||||||
|
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
|
||||||
|
'Noun, temporal', ['', '', '', '', 'NT', 'N'],
|
||||||
|
'Noun, verbal', ['', '', '', '', 'NN', 'N'],
|
||||||
|
|
||||||
|
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
|
||||||
|
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
|
||||||
|
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
|
||||||
|
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
|
||||||
|
'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
|
||||||
|
'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
|
||||||
|
'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
|
||||||
|
'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
|
||||||
|
'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
|
||||||
|
'Pronoun, existential there', ['EX0', 'EX', 'EX'],
|
||||||
|
'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
|
||||||
|
'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
|
||||||
|
'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
|
||||||
|
'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
|
||||||
|
'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
|
||||||
|
'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
|
||||||
|
'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
|
||||||
|
'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
|
||||||
|
'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
|
||||||
|
'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
|
||||||
|
|
||||||
|
'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
|
||||||
|
'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
|
||||||
|
'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
|
||||||
|
'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
|
||||||
|
'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
|
||||||
|
'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
|
||||||
|
'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
|
||||||
|
'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
|
||||||
|
'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
|
||||||
|
'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
|
||||||
|
'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
|
||||||
|
'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
|
||||||
|
'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
|
||||||
|
'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
|
||||||
|
'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
|
||||||
|
'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
|
||||||
|
'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
|
||||||
|
'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
|
||||||
|
'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
|
||||||
|
'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
|
||||||
|
'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
|
||||||
|
'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
|
||||||
|
'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
|
||||||
|
'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
|
||||||
|
'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
|
||||||
|
'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
|
||||||
|
'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
|
||||||
|
'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
|
||||||
|
'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
|
||||||
|
'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
|
||||||
|
'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
|
||||||
|
'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
|
||||||
|
|
||||||
|
'Particle', ['', '', '', '', '', 'PRT'],
|
||||||
|
'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
|
||||||
|
'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
|
||||||
|
'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
|
||||||
|
'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
|
||||||
|
'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
|
||||||
|
|
||||||
|
'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
|
||||||
|
'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
|
||||||
|
'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
|
||||||
|
'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
|
||||||
|
'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
|
||||||
|
|
||||||
|
'Possessive', ['POS', '$', 'POS'],
|
||||||
|
|
||||||
|
'Postposition', ['', '', '', 'APPO'],
|
||||||
|
|
||||||
|
'Circumposition, right', ['', '', '', 'APZR', ''],
|
||||||
|
|
||||||
|
'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
|
||||||
|
|
||||||
|
'Onomatopoeia', ['', '', '', '', 'ON'],
|
||||||
|
|
||||||
|
'Punctuation', ['', '', '', '', 'PU', 'PN'],
|
||||||
|
'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
|
||||||
|
|
||||||
|
'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
|
||||||
|
'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
|
||||||
|
'Punctuationm, comma', ['PUN', ',', ',', '$,'],
|
||||||
|
'Punctuation, dash', ['PUN', '-', '-'],
|
||||||
|
'Punctuation, dollar sign', ['PUN', '', '$'],
|
||||||
|
'Punctuation, left bracket', ['PUL', '(', '(', '$('],
|
||||||
|
'Punctuation, right bracket', ['PUR', ')', ')'],
|
||||||
|
'Punctuation, quotation mark, left', ['PUQ', '', '``'],
|
||||||
|
'Punctuation, quotation mark, right', ['PUQ', '', '"'],
|
||||||
|
|
||||||
|
'Punctuation, left bracket', ['PUL', '(', 'PPL'],
|
||||||
|
'Punctuation, right bracket', ['PUR', ')', 'PPR'],
|
||||||
|
'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
|
||||||
|
'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
|
||||||
|
'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
|
||||||
|
'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
|
||||||
|
|
||||||
|
'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
|
||||||
|
|
||||||
|
'Symbol', ['', '', 'SYM', 'XY'],
|
||||||
|
'Symbol, alphabetical', ['ZZ0', '', ''],
|
||||||
|
'Symbol, list item', ['', '', 'LS'],
|
||||||
|
|
||||||
|
# Not sure about these tags from the Chinese PTB.
|
||||||
|
'Aspect marker', ['', '', '', '', 'AS'], # ?
|
||||||
|
'Ba-construction', ['', '', '', '', 'BA'], # ?
|
||||||
|
'In relative', ['', '', '', '', 'DEC'], # ?
|
||||||
|
'Associative', ['', '', '', '', 'DER'], # ?
|
||||||
|
'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
|
||||||
|
'For words ? ', ['', '', '', '', 'ETC'], # ?
|
||||||
|
'In long bei-construct', ['', '', '', '', 'LB'], # ?
|
||||||
|
'In short bei-construct', ['', '', '', '', 'SB'], # ?
|
||||||
|
'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
|
||||||
|
'Particle, other', ['', '', '', '', 'MSP'], # ?
|
||||||
|
'Before VP', ['', '', '', '', 'DEV'], # ?
|
||||||
|
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
||||||
|
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
||||||
|
]}
|
|
@ -0,0 +1,71 @@
|
||||||
|
{cat_to_category: {
|
||||||
|
'ADJ' => 'adjective',
|
||||||
|
'ADV' => 'adverb',
|
||||||
|
'CONJ' => 'conjunction',
|
||||||
|
'COOD' => 'conjunction',
|
||||||
|
'C' => 'complementizer',
|
||||||
|
'D' => 'determiner',
|
||||||
|
'N' => 'noun',
|
||||||
|
'P' => 'preposition',
|
||||||
|
'PN' => 'punctuation',
|
||||||
|
'SC' => 'conjunction',
|
||||||
|
'V' => 'verb',
|
||||||
|
'PRT' => 'particle'
|
||||||
|
},
|
||||||
|
cat_to_description: [
|
||||||
|
['ADJ', 'Adjective'],
|
||||||
|
['ADV', 'Adverb'],
|
||||||
|
['CONJ', 'Coordination conjunction'],
|
||||||
|
['C', 'Complementizer'],
|
||||||
|
['D', 'Determiner'],
|
||||||
|
['N', 'Noun'],
|
||||||
|
['P', 'Preposition'],
|
||||||
|
['SC', 'Subordination conjunction'],
|
||||||
|
['V', 'Verb'],
|
||||||
|
['COOD', 'Part of coordination'],
|
||||||
|
['PN', 'Punctuation'],
|
||||||
|
['PRT', 'Particle'],
|
||||||
|
['S', 'Sentence']
|
||||||
|
],
|
||||||
|
xcat_to_description: [
|
||||||
|
['COOD', 'Coordinated phrase/clause'],
|
||||||
|
['IMP', 'Imperative sentence'],
|
||||||
|
['INV', 'Subject-verb inversion'],
|
||||||
|
['Q', 'Interrogative sentence with subject-verb inversion'],
|
||||||
|
['REL', 'A relativizer included'],
|
||||||
|
['FREL', 'A free relative included'],
|
||||||
|
['TRACE', 'A trace included'],
|
||||||
|
['WH', 'A wh-question word included']
|
||||||
|
],
|
||||||
|
xcat_to_ptb: [
|
||||||
|
['ADJP', '', 'ADJP'],
|
||||||
|
['ADJP', 'REL', 'WHADJP'],
|
||||||
|
['ADJP', 'FREL', 'WHADJP'],
|
||||||
|
['ADJP', 'WH', 'WHADJP'],
|
||||||
|
['ADVP', '', 'ADVP'],
|
||||||
|
['ADVP', 'REL', 'WHADVP'],
|
||||||
|
['ADVP', 'FREL', 'WHADVP'],
|
||||||
|
['ADVP', 'WH', 'WHADVP'],
|
||||||
|
['CONJP', '', 'CONJP'],
|
||||||
|
['CP', '', 'SBAR'],
|
||||||
|
['DP', '', 'NP'],
|
||||||
|
['NP', '', 'NP'],
|
||||||
|
['NX', 'NX', 'NAC'],
|
||||||
|
['NP' 'REL' 'WHNP'],
|
||||||
|
['NP' 'FREL' 'WHNP'],
|
||||||
|
['NP' 'WH' 'WHNP'],
|
||||||
|
['PP', '', 'PP'],
|
||||||
|
['PP', 'REL', 'WHPP'],
|
||||||
|
['PP', 'WH', 'WHPP'],
|
||||||
|
['PRT', '', 'PRT'],
|
||||||
|
['S', '', 'S'],
|
||||||
|
['S', 'INV', 'SINV'],
|
||||||
|
['S', 'Q', 'SQ'],
|
||||||
|
['S', 'REL', 'SBAR'],
|
||||||
|
['S', 'FREL', 'SBAR'],
|
||||||
|
['S', 'WH', 'SBARQ'],
|
||||||
|
['SCP', '', 'SBAR'],
|
||||||
|
['VP', '', 'VP'],
|
||||||
|
['VP', '', 'VP'],
|
||||||
|
['', '', 'UK']
|
||||||
|
]}
|
|
@ -0,0 +1,17 @@
{tag_to_category: {
  'C' => :complementizer,
  'PN' => :punctuation,
  'SC' => :conjunction
  }
# Paris7 Treebank functional tags
=begin
SUJ (subject)
OBJ (direct object)
ATS (predicative complement of a subject)
ATO (predicative complement of a direct object)
MOD (modifier or adjunct)
A-OBJ (indirect complement introduced by à)
DE-OBJ (indirect complement introduced by de)
P-OBJ (indirect complement introduced by another preposition)
=end
}
@ -0,0 +1,15 @@
{escape_characters: {
  '(' => '-LRB-',
  ')' => '-RRB-',
  '[' => '-LSB-',
  ']' => '-RSB-',
  '{' => '-LCB-',
  '}' => '-RCB-'
  },
phrase_tag_to_description: [
  ['S', 'Paris7 declarative clause'],
  ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
  ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
  ['SINV', 'Inverted declarative sentence'],
  ['SQ', 'Inverted yes/no question']
]}
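A short sketch of what the escape_characters map is for: bracket characters are rewritten to their PTB placeholders before text is handed to PTB-trained tools, and mapped back afterwards.

    escape = {'(' => '-LRB-', ')' => '-RRB-', '[' => '-LSB-',
              ']' => '-RSB-', '{' => '-LCB-', '}' => '-RCB-'}
    'f(x)'.gsub(/[(){}\[\]]/) { |c| escape[c] }   # => "f-LRB-x-RRB-"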
@ -6,7 +6,7 @@
   },
   time: {
     type: :annotator,
-    targets: [:group]
+    targets: [:phrase]
   },
   topics: {
     type: :annotator,

@ -22,18 +22,18 @@
   },
   name_tag: {
     type: :annotator,
-    targets: [:group]
+    targets: [:phrase, :word]
+  },
+  coreferences: {
+    type: :annotator,
+    targets: [:zone]
   },
   tf_idf: {
     type: :annotator,
     targets: [:word]
   },
-  similarity: {
-    type: :computer,
-    targets: [:entity]
-  },
-  distance: {
-    type: :computer,
-    targets: [:entity]
+  summary: {
+    type: :annotator,
+    targets: [:document]
   }
 }
@ -1,12 +1,11 @@
 {
   taggers: {
     type: :annotator,
-    targets: [:group, :token],
-    recursive: true
+    targets: [:phrase, :token]
   },
   categorizers: {
     type: :annotator,
-    targets: [:group, :token],
+    targets: [:phrase, :token],
     recursive: true
   },
   sensers: {

@ -15,5 +14,5 @@
     preset_option: :nym,
     presets: [:synonyms, :antonyms,
       :hyponyms, :hypernyms],
   }
 }
@ -0,0 +1 @@
[:extractors, :inflectors, :formatters, :learners, :lexicalizers, :processors, :retrievers]
@ -1,7 +1,7 @@
 {
   chunkers: {
     type: :transformer,
-    targets: [:document, :section],
+    targets: [:document],
     default: :autoselect
   },
   segmenters: {

@ -10,10 +10,10 @@
   },
   tokenizers: {
     type: :transformer,
-    targets: [:group]
+    targets: [:sentence, :phrase]
   },
   parsers: {
     type: :transformer,
-    targets: [:group]
+    targets: [:sentence, :phrase]
   }
 }
@ -0,0 +1,5 @@
# Contains the core classes used by Treat.
module Treat::Core
  p = Treat.paths.lib + 'treat/core/*.rb'
  Dir.glob(p).each { |f| require f }
end
@ -2,7 +2,7 @@
|
||||||
# problem as well as data for entities that
|
# problem as well as data for entities that
|
||||||
# have already been classified, complete with
|
# have already been classified, complete with
|
||||||
# references to these entities.
|
# references to these entities.
|
||||||
class Treat::Learning::DataSet
|
class Treat::Core::DataSet
|
||||||
|
|
||||||
# The classification problem this
|
# The classification problem this
|
||||||
# data set holds data for.
|
# data set holds data for.
|
||||||
|
@ -11,31 +11,16 @@ class Treat::Learning::DataSet
|
||||||
# classified (training data).
|
# classified (training data).
|
||||||
attr_accessor :items
|
attr_accessor :items
|
||||||
|
|
||||||
# Initialize the DataSet.
|
# Initialize the DataSet. Can be
|
||||||
|
# done with a Problem entity
|
||||||
|
# (thereby creating an empty set)
|
||||||
|
# or with a filename (representing
|
||||||
|
# a serialized data set which will
|
||||||
|
# then be deserialized and loaded).
|
||||||
def initialize(problem)
|
def initialize(problem)
|
||||||
unless problem.is_a?(Treat::Learning::Problem)
|
|
||||||
raise Treat::Exception, "The first argument " +
|
|
||||||
"to initialize should be an instance of " +
|
|
||||||
"Treat::Learning::Problem."
|
|
||||||
end
|
|
||||||
@problem, @items = problem, []
|
@problem, @items = problem, []
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.build(from)
|
|
||||||
if from.is_a?(Hash)
|
|
||||||
Treat::Learning::DataSet.unserialize(
|
|
||||||
Treat.databases.default.adapter, from)
|
|
||||||
elsif from.is_a?(String)
|
|
||||||
unless File.readable?(from)
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Attempting to initialize data set from " +
|
|
||||||
"file '#{from}', but it is not readable."
|
|
||||||
end
|
|
||||||
Treat::Learning::DataSet.unserialize(
|
|
||||||
File.extname(from)[1..-1], file: from)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Add an entity to the data set. The
|
# Add an entity to the data set. The
|
||||||
# entity's relevant features are
|
# entity's relevant features are
|
||||||
# calculated based on the classification
|
# calculated based on the classification
|
||||||
|
@ -43,9 +28,8 @@ class Treat::Learning::DataSet
|
||||||
# of the calculation is added to the
|
# of the calculation is added to the
|
||||||
# data set, along with the ID of the entity.
|
# data set, along with the ID of the entity.
|
||||||
def <<(entity)
|
def <<(entity)
|
||||||
@items << {
|
@items << { tags: @problem.
|
||||||
tags: (!@problem.tags.empty? ?
|
export_tags(entity),
|
||||||
@problem.export_tags(entity) : []),
|
|
||||||
features: @problem.
|
features: @problem.
|
||||||
export_features(entity),
|
export_features(entity),
|
||||||
id: entity.id }
|
id: entity.id }
|
||||||
|
@ -92,7 +76,7 @@ class Treat::Learning::DataSet
|
||||||
next unless tag.proc_string
|
next unless tag.proc_string
|
||||||
tag.proc = eval(tag.proc_string)
|
tag.proc = eval(tag.proc_string)
|
||||||
end
|
end
|
||||||
data_set = Treat::Learning::DataSet.new(problem)
|
data_set = Treat::Core::DataSet.new(problem)
|
||||||
data_set.items = items
|
data_set.items = items
|
||||||
data_set
|
data_set
|
||||||
end
|
end
|
||||||
|
@ -131,7 +115,7 @@ class Treat::Learning::DataSet
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Couldn't retrieve problem ID #{options[:problem]}."
|
"Couldn't retrieve problem ID #{options[:problem]}."
|
||||||
end
|
end
|
||||||
problem = Treat::Learning::Problem.from_hash(p_record)
|
problem = Treat::Core::Problem.from_hash(p_record)
|
||||||
data = database.collection('data').find(options).to_a
|
data = database.collection('data').find(options).to_a
|
||||||
items = []
|
items = []
|
||||||
data.each do |datum|
|
data.each do |datum|
|
||||||
|
@ -142,7 +126,7 @@ class Treat::Learning::DataSet
|
||||||
item[:id] = datum['id']
|
item[:id] = datum['id']
|
||||||
items << item
|
items << item
|
||||||
end
|
end
|
||||||
data_set = Treat::Learning::DataSet.new(problem)
|
data_set = Treat::Core::DataSet.new(problem)
|
||||||
data_set.items = items
|
data_set.items = items
|
||||||
data_set
|
data_set
|
||||||
end
|
end
|
||||||
|
@ -150,18 +134,13 @@ class Treat::Learning::DataSet
|
||||||
# Merge another data set into this one.
|
# Merge another data set into this one.
|
||||||
def merge(data_set)
|
def merge(data_set)
|
||||||
if data_set.problem != @problem
|
if data_set.problem != @problem
|
||||||
raise Treat::Exception,
|
raise Treat::Exception, # FIXME
|
||||||
"Cannot merge two data sets that " +
|
"Cannot merge two data sets that " +
|
||||||
"don't reference the same problem."
|
"don't reference the same problem."
|
||||||
else
|
else
|
||||||
@items += data_set.items
|
@items << data_set.items
|
||||||
|
@entities << data_set.entities
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Compare with other data set.
|
|
||||||
def ==(data_set)
|
|
||||||
@problem == data_set.problem &&
|
|
||||||
@items == data_set.items
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
|
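The hunks above rename Treat::Learning::DataSet back to Treat::Core::DataSet and drop the build/validation helpers. A hedged usage sketch of the resulting class, using only the calls visible in this diff; problem, question and feature construction details are elided:

    problem  = Treat::Core::Problem.new(question, feature)  # question/feature built elsewhere
    data_set = Treat::Core::DataSet.new(problem)

    data_set << sentence        # stores the entity's exported tags, features and id
    data_set.merge(other_set)   # raises unless both sets reference the same problem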
@ -1,23 +0,0 @@
module Treat::Core::DSL

  # Map all classes in Treat::Entities to
  # a global builder function (entity, word,
  # phrase, punctuation, symbol, list, etc.)
  def self.included(base)
    def method_missing(sym, *args, &block)
      @@entities ||= Treat.core.entities.list
      @@learning ||= Treat.core.learning.list
      if @@entities.include?(sym)
        klass = Treat::Entities.const_get(sym.cc)
        return klass.build(*args)
      elsif @@learning.include?(sym)
        klass = Treat::Learning.const_get(sym.cc)
        return klass.new(*args)
      else
        super(sym, *args, &block)
        raise "Uncaught method ended up in Treat DSL."
      end
    end
  end

end
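Sketch of what the removed DSL provided, assuming :word and :phrase are among Treat.core.entities.list: any lower-cased entity name became a global builder method.

    include Treat::Core::DSL

    word('dog')            # => Treat::Entities::Word.build('dog')
    phrase('the', 'dog')   # => Treat::Entities::Phrase.build('the', 'dog')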
@ -0,0 +1,39 @@
# Represents a feature to be used
# in a classification task.
class Treat::Core::Export

  # The name of the feature. If no
  # proc is supplied, this assumes
  # that the target of your classification
  # problem responds to the method
  # corresponding to this name.
  attr_reader :name
  # The feature's default value, if nil.
  attr_reader :default
  # A proc that can be used to perform
  # calculations before storing a feature.
  attr_accessor :proc
  # The proc as a string value.
  attr_accessor :proc_string

  require 'treat/core/hashable'
  include Treat::Core::Hashable

  # Initialize a feature for a classification problem.
  def initialize(name, default = nil, proc_string = nil)
    @name, @default, @proc_string =
      name, default, proc_string
    @proc = proc_string ? eval(proc_string) : nil
  end

  # Custom comparison operator for features.
  def ==(feature)
    @name == feature.name &&
    @proc == feature.proc &&
    @default == feature.default
  end

end

class Treat::Core::Feature < Treat::Core::Export; end
class Treat::Core::Tag < Treat::Core::Export; end
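Usage sketch for the export classes above. The proc string is eval'ed once at construction time; the convention that the proc receives the entity being exported is an assumption based on Problem#export further down, and the :starts_upper feature is made up for illustration:

    wc = Treat::Core::Feature.new(:word_count)          # proc-less: relies on entity#word_count
    up = Treat::Core::Feature.new(:starts_upper, false,
           '->(e) { !!(e.to_s =~ /\A[A-Z]/) }')         # hypothetical feature with a proc string
    up.proc.call(some_entity)                           # => true / false

    Treat::Core::Feature.new(:word_count) ==
      Treat::Core::Feature.new(:word_count)             # => true (name, proc and default all match)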
@ -0,0 +1,12 @@
module Treat::Core::Hashable

  def to_hash
    hash = {}
    instance_variables.each do |var|
      val = instance_variable_get(var)
      hash[var.to_s.delete("@")] = val
    end
    hash
  end

end
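to_hash simply dumps instance variables keyed by name, which is what the serialization code relies on. For the Feature class just defined:

    Treat::Core::Feature.new(:word_count, 0).to_hash
    # => {"name"=>:word_count, "default"=>0, "proc_string"=>nil, "proc"=>nil}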
@ -2,7 +2,7 @@
|
||||||
# - What question are we trying to answer?
|
# - What question are we trying to answer?
|
||||||
# - What features are we going to look at
|
# - What features are we going to look at
|
||||||
# to attempt to answer that question?
|
# to attempt to answer that question?
|
||||||
class Treat::Learning::Problem
|
class Treat::Core::Problem
|
||||||
|
|
||||||
# A unique identifier for the problem.
|
# A unique identifier for the problem.
|
||||||
attr_accessor :id
|
attr_accessor :id
|
||||||
|
@ -18,38 +18,32 @@ class Treat::Learning::Problem
|
||||||
attr_reader :tag_labels
|
attr_reader :tag_labels
|
||||||
|
|
||||||
# Initialize the problem with a question
|
# Initialize the problem with a question
|
||||||
# and an arbitrary number of features. # FIXME: init with id!?
|
# and an arbitrary number of features.
|
||||||
def initialize(question, *exports)
|
def initialize(question, *exports)
|
||||||
unless question.is_a?(Treat::Learning::Question)
|
unless question.is_a?(Treat::Core::Question)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"The first argument to initialize " +
|
"The first argument to initialize " +
|
||||||
"should be an instance of " +
|
"should be an instance of " +
|
||||||
"Treat::Learning::Question."
|
"Treat::Core::Question."
|
||||||
end
|
end
|
||||||
if exports.any? { |f| !f.is_a?(Treat::Learning::Export) }
|
if exports.any? { |f| !f.is_a?(Treat::Core::Export) }
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"The second argument and all subsequent ones " +
|
"The second argument and all subsequent ones " +
|
||||||
"to initialize should be instances of subclasses " +
|
"to initialize should be instances of subclasses " +
|
||||||
"of Treat::Learning::Export."
|
"of Treat::Core::Export."
|
||||||
end
|
end
|
||||||
@question, @id = question, object_id
|
@question, @id = question, object_id
|
||||||
@features = exports.select do |exp|
|
@features = exports.select do |exp|
|
||||||
exp.is_a?(Treat::Learning::Feature)
|
exp.is_a?(Treat::Core::Feature)
|
||||||
end
|
|
||||||
if @features.size == 0
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Problem should be supplied with at least "+
|
|
||||||
"one feature to work with."
|
|
||||||
end
|
end
|
||||||
@tags = exports.select do |exp|
|
@tags = exports.select do |exp|
|
||||||
exp.is_a?(Treat::Learning::Tag)
|
exp.is_a?(Treat::Core::Tag)
|
||||||
end
|
end
|
||||||
@feature_labels = @features.map { |f| f.name }
|
@feature_labels = @features.map { |f| f.name }
|
||||||
@tag_labels = @tags.map { |t| t.name }
|
@tag_labels = @tags.map { |t| t.name }
|
||||||
end
|
end
|
||||||
|
|
||||||
# Custom comparison for problems.
|
# Custom comparison for problems.
|
||||||
# Should we check for ID here ? FIXME
|
|
||||||
def ==(problem)
|
def ==(problem)
|
||||||
@question == problem.question &&
|
@question == problem.question &&
|
||||||
@features == problem.features &&
|
@features == problem.features &&
|
||||||
|
@ -63,29 +57,15 @@ class Treat::Learning::Problem
|
||||||
# all of the features.
|
# all of the features.
|
||||||
def export_features(e, include_answer = true)
|
def export_features(e, include_answer = true)
|
||||||
features = export(e, @features)
|
features = export(e, @features)
|
||||||
return features if !include_answer
|
return features unless include_answer
|
||||||
features << (e.has?(@question.name) ?
|
features << (e.has?(@question.name) ?
|
||||||
e.get(@question.name) : @question.default)
|
e.get(@question.name) : @question.default)
|
||||||
features
|
features
|
||||||
end
|
end
|
||||||
|
|
||||||
def export_tags(entity)
|
def export_tags(e); export(e, @tags); end
|
||||||
if @tags.empty?
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Cannot export the tags, because " +
|
|
||||||
"this problem doesn't have any."
|
|
||||||
end
|
|
||||||
export(entity, @tags)
|
|
||||||
end
|
|
||||||
|
|
||||||
def export(entity, exports)
|
def export(entity, exports)
|
||||||
unless @question.target == entity.type
|
|
||||||
targ, type = @question.target, entity.type
|
|
||||||
raise Treat::Exception,
|
|
||||||
"This classification problem targets " +
|
|
||||||
"#{targ}s, but a(n) #{type} " +
|
|
||||||
"was passed to export instead."
|
|
||||||
end
|
|
||||||
ret = []
|
ret = []
|
||||||
exports.each do |export|
|
exports.each do |export|
|
||||||
r = export.proc ?
|
r = export.proc ?
|
||||||
|
@ -97,44 +77,35 @@ class Treat::Learning::Problem
|
||||||
end
|
end
|
||||||
|
|
||||||
def to_hash
|
def to_hash
|
||||||
{'question' => object_to_hash(@question),
|
{'question' => @question.to_hash,
|
||||||
'features' => @features.map { |f|
|
'features' => @features.map { |f|
|
||||||
object_to_hash(f.tap { |f| f.proc = nil }) },
|
f.tap { |f| f.proc = nil }.to_hash },
|
||||||
'tags' => @tags.map { |t|
|
'tags' => @tags.map { |t|
|
||||||
object_to_hash(t.tap { |t| t.proc = nil }) },
|
t.tap { |t| t.proc = nil }.to_hash },
|
||||||
'id' => @id }
|
'id' => @id }
|
||||||
end
|
end
|
||||||
|
|
||||||
def object_to_hash(obj)
|
|
||||||
hash = {}
|
|
||||||
obj.instance_variables.each do |var|
|
|
||||||
val = obj.instance_variable_get(var)
|
|
||||||
hash[var.to_s.delete("@")] = val
|
|
||||||
end
|
|
||||||
hash
|
|
||||||
end
|
|
||||||
|
|
||||||
def self.from_hash(hash)
|
def self.from_hash(hash)
|
||||||
question = Treat::Learning::Question.new(
|
question = Treat::Core::Question.new(
|
||||||
hash['question']['name'],
|
hash['question']['name'],
|
||||||
hash['question']['target'],
|
hash['question']['target'],
|
||||||
hash['question']['default'],
|
hash['question']['type'],
|
||||||
hash['question']['type']
|
hash['question']['default']
|
||||||
)
|
)
|
||||||
features = []
|
features = []
|
||||||
hash['features'].each do |feature|
|
hash['features'].each do |feature|
|
||||||
features << Treat::Learning::Feature.new(
|
features << Treat::Core::Feature.new(
|
||||||
feature['name'], feature['default'],
|
feature['name'], feature['default'],
|
||||||
feature['proc_string'])
|
feature['proc_string'])
|
||||||
end
|
end
|
||||||
tags = []
|
tags = []
|
||||||
hash['tags'].each do |tag|
|
hash['tags'].each do |tag|
|
||||||
tags << Treat::Learning::Tag.new(
|
tags << Treat::Core::Tag.new(
|
||||||
tag['name'], tag['default'],
|
tag['name'], tag['default'],
|
||||||
tag['proc_string'])
|
tag['proc_string'])
|
||||||
end
|
end
|
||||||
features_and_tags = features + tags
|
features_and_tags = features + tags
|
||||||
p = Treat::Learning::Problem.new(question, *features_and_tags)
|
p = Treat::Core::Problem.new(question, *features_and_tags)
|
||||||
p.id = hash['id']
|
p.id = hash['id']
|
||||||
p
|
p
|
||||||
end
|
end
|
|
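A minimal sketch of wiring these pieces together, using the mongo-data-sets signatures shown above (on master the same classes live under Treat::Learning, Question#initialize takes default before type, and the inputs are validated; all names and the proc string are illustrative):

    # A discrete question asked about every sentence:
    question = Treat::Core::Question.new(:is_key_sentence, :sentence, :discrete, false)

    # A feature whose proc is eval'ed from the stored string:
    feature = Treat::Core::Feature.new(:word_count, 0, "->(e) { e.word_count }")

    problem = Treat::Core::Problem.new(question, feature)
    problem.feature_labels   # => [:word_count]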
@ -1,6 +1,9 @@
|
||||||
# Defines a question to answer in the
|
# Defines a question to answer in the
|
||||||
# context of a classification problem.
|
# context of a classification problem.
|
||||||
class Treat::Learning::Question
|
class Treat::Core::Question
|
||||||
|
|
||||||
|
require 'treat/core/hashable'
|
||||||
|
include Treat::Core::Hashable
|
||||||
|
|
||||||
# Defines an arbitrary label for the
|
# Defines an arbitrary label for the
|
||||||
# question we are trying to answer
|
# question we are trying to answer
|
||||||
|
@ -8,32 +11,20 @@ class Treat::Learning::Question
|
||||||
# also be used as the annotation name
|
# also be used as the annotation name
|
||||||
# for the answer to the question.
|
# for the answer to the question.
|
||||||
attr_reader :name
|
attr_reader :name
|
||||||
# Defines the target of the question
|
|
||||||
# (e.g. :sentence, :paragraph, etc.)
|
|
||||||
attr_reader :target
|
|
||||||
# Can be :continuous or :discrete,
|
# Can be :continuous or :discrete,
|
||||||
# depending on the features used.
|
# depending on the features used.
|
||||||
attr_reader :type
|
attr_reader :type
|
||||||
|
# Defines the target of the question
|
||||||
|
# (e.g. :sentence, :paragraph, etc.)
|
||||||
|
attr_reader :target
|
||||||
# Default for the answer to the question.
|
# Default for the answer to the question.
|
||||||
attr_reader :default
|
attr_reader :default
|
||||||
|
|
||||||
# Initialize the question.
|
# Initialize the question.
|
||||||
def initialize(name, target, default = nil, type = :continuous)
|
def initialize(name, target,
|
||||||
unless name.is_a?(Symbol)
|
type = :continuous, default = nil)
|
||||||
raise Treat::Exception,
|
@name, @target = name, target
|
||||||
"Question name should be a symbol."
|
@type, @default = type, default
|
||||||
end
|
|
||||||
unless Treat.core.entities.list.include?(target)
|
|
||||||
raise Treat::Exception, "Target type should be " +
|
|
||||||
"a symbol and should be one of the following: " +
|
|
||||||
Treat.core.entities.list.inspect
|
|
||||||
end
|
|
||||||
unless [:continuous, :discrete].include?(type)
|
|
||||||
raise Treat::Exception, "Type should be " +
|
|
||||||
"continuous or discrete."
|
|
||||||
end
|
|
||||||
@name, @target, @type, @default =
|
|
||||||
name, target, type, default
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Custom comparison operator for questions.
|
# Custom comparison operator for questions.
|
|
@ -1,41 +0,0 @@
|
||||||
class Treat::Core::Server
|
|
||||||
|
|
||||||
# Refer to http://rack.rubyforge.org/doc/classes/Rack/Server.html
|
|
||||||
# for possible options to configure.
|
|
||||||
def initialize(handler = 'thin', options = {})
|
|
||||||
raise "Implementation not finished."
|
|
||||||
require 'json'; require 'rack'
|
|
||||||
@handler, @options = handler.capitalize, options
|
|
||||||
end
|
|
||||||
|
|
||||||
def start
|
|
||||||
handler = Rack::Handler.const_get(@handler)
|
|
||||||
handler.run(self, @options)
|
|
||||||
end
|
|
||||||
|
|
||||||
def call(env)
|
|
||||||
headers = { 'content-type' => 'application/json' }
|
|
||||||
rack_input = env["rack.input"].read
|
|
||||||
if rack_input.strip == ''
|
|
||||||
return [500, headers, {
|
|
||||||
'error' => 'Empty JSON request.'
|
|
||||||
}]
|
|
||||||
end
|
|
||||||
rack_json = JSON.parse(rack_input)
|
|
||||||
unless rack_json['type'] &&
|
|
||||||
rack_json['value'] && rack_json['do']
|
|
||||||
return [500, headers, {
|
|
||||||
'error' => 'Must specify "type", "value" and "do".'
|
|
||||||
}]
|
|
||||||
end
|
|
||||||
if rack_json['conf']
|
|
||||||
# Set the configuration.
|
|
||||||
end
|
|
||||||
method = rack_json['type'].capitalize.intern
|
|
||||||
resp = send(method, rack_json[value]).do(rack_json['do'])
|
|
||||||
|
|
||||||
response = [rack_input.to_json]
|
|
||||||
[200, headers, response]
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
|
|
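The removed server stub above targets Rack's call(env) contract, which expects a status, a headers hash and a body object that responds to #each. For orientation, a minimal compliant JSON endpoint could look like this sketch (not part of Treat; the handler and port are illustrative):

    require 'json'
    require 'rack'

    app = lambda do |env|
      payload = { 'status' => 'ok' }
      [200, { 'content-type' => 'application/json' }, [payload.to_json]]
    end

    # Rack::Handler::Thin.run(app, Port: 9292)   # assuming thin is installed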
@ -0,0 +1,6 @@
|
||||||
|
# Contains the textual model used by Treat.
|
||||||
|
module Treat::Entities
|
||||||
|
require 'treat/entities/entity'
|
||||||
|
p = Treat.paths.lib + 'treat/entities/*.rb'
|
||||||
|
Dir.glob(p).each { |f| require f }
|
||||||
|
end
|
|
@ -3,7 +3,7 @@
|
||||||
# a string or a numeric object. This class
|
# a string or a numeric object. This class
|
||||||
# is pretty much self-explanatory.
|
# is pretty much self-explanatory.
|
||||||
# FIXME how can we make this language independent?
|
# FIXME how can we make this language independent?
|
||||||
module Treat::Entities::Entity::Buildable
|
module Treat::Entities::Abilities::Buildable
|
||||||
|
|
||||||
require 'schiphol'
|
require 'schiphol'
|
||||||
require 'fileutils'
|
require 'fileutils'
|
||||||
|
@ -15,21 +15,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
PunctRegexp = /^[[:punct:]\$]+$/
|
PunctRegexp = /^[[:punct:]\$]+$/
|
||||||
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
||||||
EmailRegexp = /.+\@.+\..+/
|
EmailRegexp = /.+\@.+\..+/
|
||||||
Enclitics = [
|
Enclitics = %w['ll 'm 're 's 't 've]
|
||||||
# EXAMPLE:
|
|
||||||
"'d", # I'd => I would
|
|
||||||
"'ll", # I'll => I will
|
|
||||||
"'m", # I'm => I am
|
|
||||||
"'re", # We're => We are
|
|
||||||
"'s", # There's => There is
|
|
||||||
# Let's => Let us
|
|
||||||
"'t", # 'Twas => Archaic ('Twas the night)
|
|
||||||
"'ve", # They've => They have
|
|
||||||
"n't" # Can't => Can not
|
|
||||||
]
|
|
||||||
|
|
||||||
# Accepted formats of serialized files
|
|
||||||
AcceptedFormats = ['.xml', '.yml', '.yaml', '.mongo']
|
|
||||||
|
|
||||||
# Reserved folder names
|
# Reserved folder names
|
||||||
Reserved = ['.index']
|
Reserved = ['.index']
|
||||||
|
@ -37,38 +23,23 @@ module Treat::Entities::Entity::Buildable
|
||||||
# Build an entity from anything (can be
|
# Build an entity from anything (can be
|
||||||
# a string, numeric, folder, or file name
|
# a string, numeric, folder, or file name
|
||||||
# representing a raw or serialized file).
|
# representing a raw or serialized file).
|
||||||
def build(*args)
|
def build(file_or_value, options = {})
|
||||||
|
|
||||||
# This probably needs some doc.
|
|
||||||
if args.size == 0
|
|
||||||
file_or_value = ''
|
|
||||||
elsif args[0].is_a?(Hash)
|
|
||||||
file_or_value = args[0]
|
|
||||||
elsif args.size == 1
|
|
||||||
if args[0].is_a?(Treat::Entities::Entity)
|
|
||||||
args[0] = [args[0]]
|
|
||||||
end
|
|
||||||
file_or_value = args[0]
|
|
||||||
else
|
|
||||||
file_or_value = args
|
|
||||||
end
|
|
||||||
|
|
||||||
fv = file_or_value.to_s
|
fv = file_or_value.to_s
|
||||||
|
|
||||||
if fv == ''; self.new
|
if file_or_value.is_a?(Hash)
|
||||||
elsif file_or_value.is_a?(Array)
|
|
||||||
from_array(file_or_value)
|
|
||||||
elsif file_or_value.is_a?(Hash)
|
|
||||||
from_db(file_or_value)
|
from_db(file_or_value)
|
||||||
elsif self == Treat::Entities::Document || (is_serialized_file?(fv))
|
elsif self == Treat::Entities::Document ||
|
||||||
|
(fv.index('yml') || fv.index('yaml') ||
|
||||||
|
fv.index('xml') || fv.index('mongo'))
|
||||||
if fv =~ UriRegexp
|
if fv =~ UriRegexp
|
||||||
from_url(fv)
|
from_url(fv, options)
|
||||||
else
|
else
|
||||||
from_file(fv)
|
from_file(fv, options)
|
||||||
end
|
end
|
||||||
elsif self == Treat::Entities::Collection
|
elsif self == Treat::Entities::Collection
|
||||||
if FileTest.directory?(fv)
|
if FileTest.directory?(fv)
|
||||||
from_folder(fv)
|
from_folder(fv, options)
|
||||||
else
|
else
|
||||||
create_collection(fv)
|
create_collection(fv)
|
||||||
end
|
end
|
||||||
|
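In practice the dispatch above lets a single #build call accept very different inputs. A few illustrative calls (the file, URL and folder names are hypothetical):

    Treat::Entities::Entity.build('A short sentence.')        # string => usually a Sentence
    Treat::Entities::Document.build('article.txt')            # raw file, read with the autoselected reader
    Treat::Entities::Document.build('http://www.example.com/page.html')  # URL, downloaded then read
    Treat::Entities::Collection.build('corpus/')               # folder, walked recursively (created if missing)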
@ -92,34 +63,27 @@ module Treat::Entities::Entity::Buildable
|
||||||
# is user-created (i.e. by calling build
|
# is user-created (i.e. by calling build
|
||||||
# instead of from_string directly).
|
# instead of from_string directly).
|
||||||
def from_string(string, enforce_type = false)
|
def from_string(string, enforce_type = false)
|
||||||
# If calling using the build syntax (i.e. user-
|
|
||||||
# called), enforce the type that was supplied.
|
|
||||||
enforce_type = true if caller_method == :build
|
enforce_type = true if caller_method == :build
|
||||||
|
|
||||||
unless self == Treat::Entities::Entity
|
unless self == Treat::Entities::Entity
|
||||||
return self.new(string) if enforce_type
|
return self.new(string) if enforce_type
|
||||||
end
|
end
|
||||||
e = anything_from_string(string)
|
|
||||||
if enforce_type && !e.is_a?(self)
|
|
||||||
raise "Asked to build a #{self.mn.downcase} "+
|
|
||||||
"from \"#{string}\" and to enforce type, "+
|
|
||||||
"but type detected was #{e.class.mn.downcase}."
|
|
||||||
end
|
|
||||||
e
|
|
||||||
end
|
|
||||||
|
|
||||||
# Build a document from an array
|
e = anything_from_string(string)
|
||||||
# of builders.
|
|
||||||
def from_array(array)
|
if enforce_type && !e.is_a?(self)
|
||||||
obj = self.new
|
raise "Asked to build a #{cl(self).downcase} "+
|
||||||
array.each do |el|
|
"from \"#{string}\" and to enforce type, "+
|
||||||
el = el.to_entity unless el.is_a?(Treat::Entities::Entity)
|
"but type detected was #{cl(e.class).downcase}."
|
||||||
obj << el
|
|
||||||
end
|
end
|
||||||
obj
|
|
||||||
|
e
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a document from a URL.
|
# Build a document from a URL.
|
||||||
def from_url(url)
|
def from_url(url, options)
|
||||||
unless self ==
|
unless self ==
|
||||||
Treat::Entities::Document
|
Treat::Entities::Document
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
|
@ -127,22 +91,16 @@ module Treat::Entities::Entity::Buildable
|
||||||
'else than a document from a url.'
|
'else than a document from a url.'
|
||||||
end
|
end
|
||||||
|
|
||||||
begin
|
f = Schiphol.download(url,
|
||||||
folder = Treat.paths.files
|
:download_folder => Treat.paths.files,
|
||||||
if folder[-1] == '/'
|
:show_progress => Treat.core.verbosity.silence,
|
||||||
folder = folder[0..-2]
|
:rectify_extensions => true,
|
||||||
end
|
:max_tries => 3
|
||||||
f = Schiphol.download(url,
|
)
|
||||||
download_folder: folder,
|
|
||||||
show_progress: !Treat.core.verbosity.silence,
|
|
||||||
rectify_extensions: true,
|
|
||||||
max_tries: 3)
|
|
||||||
rescue
|
|
||||||
raise Treat::Exception,
|
|
||||||
"Couldn't download file at #{url}."
|
|
||||||
end
|
|
||||||
|
|
||||||
e = from_file(f,'html')
|
options[:default_to] ||= 'html'
|
||||||
|
|
||||||
|
e = from_file(f, options)
|
||||||
e.set :url, url.to_s
|
e.set :url, url.to_s
|
||||||
e
|
e
|
||||||
|
|
||||||
|
@ -165,7 +123,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
# Build an entity from a folder with documents.
|
# Build an entity from a folder with documents.
|
||||||
# Folders will be searched recursively.
|
# Folders will be searched recursively.
|
||||||
def from_folder(folder)
|
def from_folder(folder, options)
|
||||||
|
|
||||||
return if Reserved.include?(folder)
|
return if Reserved.include?(folder)
|
||||||
|
|
||||||
|
@ -191,43 +149,39 @@ module Treat::Entities::Entity::Buildable
|
||||||
c = Treat::Entities::Collection.new(folder)
|
c = Treat::Entities::Collection.new(folder)
|
||||||
folder += '/' unless folder[-1] == '/'
|
folder += '/' unless folder[-1] == '/'
|
||||||
|
|
||||||
if !FileTest.directory?(folder)
|
|
||||||
FileUtils.mkdir(folder)
|
|
||||||
end
|
|
||||||
|
|
||||||
c.set :folder, folder
|
|
||||||
i = folder + '/.index'
|
|
||||||
c.set :index, i if FileTest.directory?(i)
|
|
||||||
|
|
||||||
Dir[folder + '*'].each do |f|
|
Dir[folder + '*'].each do |f|
|
||||||
if FileTest.directory?(f)
|
if FileTest.directory?(f)
|
||||||
c2 = Treat::Entities::Collection.
|
c2 = Treat::Entities::Collection.
|
||||||
from_folder(f)
|
from_folder(f, options)
|
||||||
c.<<(c2, false) if c2
|
c.<<(c2, false) if c2
|
||||||
else
|
else
|
||||||
c.<<(Treat::Entities::Document.
|
c.<<(Treat::Entities::Document.
|
||||||
from_file(f), false)
|
from_file(f, options), false)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
c
|
||||||
return c
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a document from a raw or serialized file.
|
# Build a document from a raw or serialized file.
|
||||||
def from_file(file,def_fmt=nil)
|
def from_file(file, options)
|
||||||
|
|
||||||
if is_serialized_file?(file)
|
if file.index('yml') ||
|
||||||
from_serialized_file(file)
|
file.index('yaml') ||
|
||||||
|
file.index('xml') ||
|
||||||
|
file.index('mongo')
|
||||||
|
from_serialized_file(file, options)
|
||||||
else
|
else
|
||||||
fmt = Treat::Workers::Formatters::Readers::Autoselect.detect_format(file,def_fmt)
|
fmt = Treat::Workers::Formatters::Readers::Autoselect.
|
||||||
from_raw_file(file, fmt)
|
detect_format(file, options[:default_to])
|
||||||
|
options[:_format] = fmt
|
||||||
|
from_raw_file(file, options)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a document from a raw file.
|
# Build a document from a raw file.
|
||||||
def from_raw_file(file, def_fmt='txt')
|
def from_raw_file(file, options)
|
||||||
|
|
||||||
unless self ==
|
unless self ==
|
||||||
Treat::Entities::Document
|
Treat::Entities::Document
|
||||||
|
@ -241,40 +195,32 @@ module Treat::Entities::Entity::Buildable
|
||||||
"Path '#{file}' does not "+
|
"Path '#{file}' does not "+
|
||||||
"point to a readable file."
|
"point to a readable file."
|
||||||
end
|
end
|
||||||
options = {default_format: def_fmt}
|
|
||||||
d = Treat::Entities::Document.new
|
d = Treat::Entities::Document.new(file)
|
||||||
d.set :file, file
|
|
||||||
d.read(:autoselect, options)
|
d.read(:autoselect, options)
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build an entity from a serialized file.
|
# Build an entity from a serialized file.
|
||||||
def from_serialized_file(file)
|
def from_serialized_file(file, options)
|
||||||
|
|
||||||
unless File.readable?(file)
|
if file.index('mongo')
|
||||||
raise Treat::Exception,
|
options[:id] = file.scan( # Consolidate this
|
||||||
"Path '#{file}' does not "+
|
/([0-9]+)\.mongo/).first.first
|
||||||
"point to a readable file."
|
from_db(:mongo, options)
|
||||||
end
|
|
||||||
doc = Treat::Entities::Document.new
|
|
||||||
doc.set :file, file
|
|
||||||
format = nil
|
|
||||||
if File.extname(file) == '.yml' ||
|
|
||||||
File.extname(file) == '.yaml'
|
|
||||||
format = :yaml
|
|
||||||
elsif File.extname(file) == '.xml'
|
|
||||||
format = :xml
|
|
||||||
else
|
else
|
||||||
raise Treat::Exception,
|
unless File.readable?(file)
|
||||||
"Unreadable serialized format for #{file}."
|
raise Treat::Exception,
|
||||||
|
"Path '#{file}' does not "+
|
||||||
|
"point to a readable file."
|
||||||
|
end
|
||||||
|
d = Treat::Entities::Document.new(file)
|
||||||
|
d.unserialize(:autoselect, options)
|
||||||
|
d.children[0].set_as_root! # Fix this
|
||||||
|
d.children[0]
|
||||||
end
|
end
|
||||||
doc.unserialize(format)
|
|
||||||
doc.children[0].set_as_root! # Fix this
|
|
||||||
doc.children[0]
|
|
||||||
end
|
|
||||||
|
|
||||||
def is_serialized_file?(path_to_check)
|
|
||||||
(AcceptedFormats.include? File.extname(path_to_check)) && (File.file?(path_to_check))
|
|
||||||
end
|
end
|
||||||
|
|
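Both branches route serialized inputs through from_serialized_file; the difference is how the format is resolved. A hedged sketch of what a caller sees (file names are hypothetical):

    # master: the extension decides the unserializer (.yml/.yaml or .xml)
    doc = Treat::Entities::Document.build('article.yml')

    # mongo-data-sets: the numeric id is scraped from the *.mongo name and
    # the entity is fetched from the configured Mongo database
    doc = Treat::Entities::Document.build('4242.mongo')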
||||||
def from_db(hash)
|
def from_db(hash)
|
||||||
|
@ -292,28 +238,15 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
# Build any kind of entity from a string.
|
# Build any kind of entity from a string.
|
||||||
def anything_from_string(string)
|
def anything_from_string(string)
|
||||||
case self.mn.downcase.intern
|
|
||||||
when :document
|
|
||||||
folder = Treat.paths.files
|
|
||||||
if folder[-1] == '/'
|
|
||||||
folder = folder[0..-2]
|
|
||||||
end
|
|
||||||
|
|
||||||
now = Time.now.to_f
|
case cl(self).downcase.intern
|
||||||
doc_file = folder+ "/#{now}.txt"
|
when :document, :collection
|
||||||
string.force_encoding('UTF-8')
|
|
||||||
File.open(doc_file, 'w') do |f|
|
|
||||||
f.puts string
|
|
||||||
end
|
|
||||||
|
|
||||||
from_raw_file(doc_file)
|
|
||||||
when :collection
|
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Cannot create a " +
|
"Cannot create a document or " +
|
||||||
"collection from a string " +
|
"collection from a string " +
|
||||||
"(need a readable file/folder)."
|
"(need a readable file/folder)."
|
||||||
when :phrase
|
when :phrase
|
||||||
group_from_string(string)
|
sentence_or_phrase_from_string(string)
|
||||||
when :token
|
when :token
|
||||||
token_from_string(string)
|
token_from_string(string)
|
||||||
when :zone
|
when :zone
|
||||||
|
@ -325,7 +258,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
if string.gsub(/[\.\!\?]+/,
|
if string.gsub(/[\.\!\?]+/,
|
||||||
'.').count('.') <= 1 &&
|
'.').count('.') <= 1 &&
|
||||||
string.count("\n") == 0
|
string.count("\n") == 0
|
||||||
group_from_string(string)
|
sentence_or_phrase_from_string(string)
|
||||||
else
|
else
|
||||||
zone_from_string(string)
|
zone_from_string(string)
|
||||||
end
|
end
|
||||||
|
@ -336,14 +269,15 @@ module Treat::Entities::Entity::Buildable
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# This should be improved on.
|
|
||||||
def check_encoding(string)
|
def check_encoding(string)
|
||||||
string.encode("UTF-8", undef: :replace) # Fix
|
string.encode("UTF-8", undef: :replace) # Fix
|
||||||
end
|
end
|
||||||
|
|
||||||
# Build a phrase from a string.
|
# Build a phrase from a string.
|
||||||
def group_from_string(string)
|
def sentence_or_phrase_from_string(string)
|
||||||
|
|
||||||
check_encoding(string)
|
check_encoding(string)
|
||||||
|
|
||||||
if !(string =~ /[a-zA-Z]+/)
|
if !(string =~ /[a-zA-Z]+/)
|
||||||
Treat::Entities::Fragment.new(string)
|
Treat::Entities::Fragment.new(string)
|
||||||
elsif string.count('.!?') >= 1
|
elsif string.count('.!?') >= 1
|
||||||
|
@ -351,6 +285,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
else
|
else
|
||||||
Treat::Entities::Phrase.new(string)
|
Treat::Entities::Phrase.new(string)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
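The branching above boils down to three cases for strings with a single sentence ender and no newline; multi-sentence strings fall through to zone_from_string, which is not shown here. Roughly:

    Treat::Entities::Entity.build('$$ 42 %%')           # no letters at all     => Fragment
    Treat::Entities::Entity.build('a short phrase')     # letters, no . ! ?     => Phrase
    Treat::Entities::Entity.build('A full sentence.')   # letters plus an ender => Sentence
    Treat::Entities::Entity.build("One line.\nTwo.")    # several enders or a newline => handled by zone_from_string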
||||||
# Build the right type of token
|
# Build the right type of token
|
||||||
|
@ -396,7 +331,7 @@ module Treat::Entities::Entity::Buildable
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def create_collection(fv)
|
def create_collection(fv)
|
||||||
FileUtils.mkdir(fv)
|
FileUtils.mkdir(fv)
|
||||||
Treat::Entities::Collection.new(fv)
|
Treat::Entities::Collection.new(fv)
|
|
@ -1,7 +1,7 @@
|
||||||
# This module implements methods that are used
|
# This module implements methods that are used
|
||||||
# by workers to determine if an entity is properly
|
# by workers to determine if an entity is properly
|
||||||
# formatted before working on it.
|
# formatted before working on it.
|
||||||
module Treat::Entities::Entity::Checkable
|
module Treat::Entities::Abilities::Checkable
|
||||||
|
|
||||||
# Check if the entity has the given feature,
|
# Check if the entity has the given feature,
|
||||||
# and if so return it. If not, calculate the
|
# and if so return it. If not, calculate the
|
||||||
|
@ -15,7 +15,7 @@ module Treat::Entities::Entity::Checkable
|
||||||
g2 = Treat::Workers.lookup(feature)
|
g2 = Treat::Workers.lookup(feature)
|
||||||
|
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"#{g1.type.to_s.capitalize} " +
|
"#{g1.type.to_s.capitalize} #{task} " +
|
||||||
"requires #{g2.type} #{g2.method}."
|
"requires #{g2.type} #{g2.method}."
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,21 +1,21 @@
|
||||||
# Allow comparison of entity hierarchy in DOM.
|
module Treat::Entities::Abilities::Comparable
|
||||||
module Treat::Entities::Entity::Comparable
|
|
||||||
|
|
||||||
# Determines whether the receiving class
|
|
||||||
# is smaller, equal or greater in the DOM
|
|
||||||
# hierarchy compared to the supplied one.
|
|
||||||
def compare_with(klass)
|
def compare_with(klass)
|
||||||
|
|
||||||
i = 0; rank_a = nil; rank_b = nil
|
i = 0; rank_a = nil; rank_b = nil
|
||||||
|
|
||||||
Treat.core.entities.order.each do |type|
|
Treat.core.entities.order.each do |type|
|
||||||
klass2 = Treat::Entities.const_get(type.cc)
|
klass2 = Treat::Entities.const_get(cc(type))
|
||||||
rank_a = i if self <= klass2
|
rank_a = i if self <= klass2
|
||||||
rank_b = i if klass <= klass2
|
rank_b = i if klass <= klass2
|
||||||
next if rank_a && rank_b
|
next if rank_a && rank_b
|
||||||
i += 1
|
i += 1
|
||||||
end
|
end
|
||||||
|
|
||||||
return -1 if rank_a < rank_b
|
return -1 if rank_a < rank_b
|
||||||
return 0 if rank_a == rank_b
|
return 0 if rank_a == rank_b
|
||||||
return 1 if rank_a > rank_b
|
return 1 if rank_a > rank_b
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -0,0 +1,47 @@
|
||||||
|
module Treat::Entities::Abilities::Copyable
|
||||||
|
|
||||||
|
require 'fileutils'
|
||||||
|
|
||||||
|
# What happens when it is a database-stored
|
||||||
|
# collection or document ?
|
||||||
|
def copy_into(collection)
|
||||||
|
unless collection.is_a?(
|
||||||
|
Treat::Entities::Collection)
|
||||||
|
raise Treat::Exception,
|
||||||
|
"Cannot copy an entity into " +
|
||||||
|
"something else than a collection."
|
||||||
|
end
|
||||||
|
if type == :document
|
||||||
|
copy_document_into(collection)
|
||||||
|
elsif type == :collection
|
||||||
|
copy_collection_into(collection)
|
||||||
|
else
|
||||||
|
raise Treat::Exception,
|
||||||
|
"Can only copy a document " +
|
||||||
|
"or collection into a collection."
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def copy_collection_into(collection)
|
||||||
|
copy = dup
|
||||||
|
f = File.dirname(folder)
|
||||||
|
f = f.split(File::SEPARATOR)[-1]
|
||||||
|
f = File.join(collection.folder, f)
|
||||||
|
FileUtils.mkdir(f) unless
|
||||||
|
FileTest.directory(f)
|
||||||
|
FileUtils.cp_r(folder, f)
|
||||||
|
copy.set :folder, f
|
||||||
|
copy
|
||||||
|
end
|
||||||
|
|
||||||
|
def copy_document_into(collection)
|
||||||
|
copy = dup
|
||||||
|
return copy unless file
|
||||||
|
f = File.basename(file)
|
||||||
|
f = File.join(collection.folder, f)
|
||||||
|
FileUtils.cp(file, f)
|
||||||
|
copy.set :file, f
|
||||||
|
copy
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
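On the mongo-data-sets side, copy_into duplicates the backing file or folder into the target collection's folder and returns the copy. A small sketch (paths are illustrative):

    corpus = Treat::Entities::Collection.build('corpus')   # creates corpus/ if needed
    doc    = Treat::Entities::Document.build('notes.txt')

    copy = doc.copy_into(corpus)
    copy.file   # => "corpus/notes.txt" (the original file is left in place)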
@ -1,4 +1,4 @@
|
||||||
module Treat::Entities::Entity::Countable
|
module Treat::Entities::Abilities::Countable
|
||||||
|
|
||||||
# Find the position of the current entity
|
# Find the position of the current entity
|
||||||
# inside the parent entity, starting at 1.
|
# inside the parent entity, starting at 1.
|
||||||
|
@ -41,7 +41,6 @@ module Treat::Entities::Entity::Countable
|
||||||
# Returns the frequency of the given value
|
# Returns the frequency of the given value
|
||||||
# in this entity.
|
# in this entity.
|
||||||
def frequency_of(value)
|
def frequency_of(value)
|
||||||
value = value.downcase
|
|
||||||
if is_a?(Treat::Entities::Token)
|
if is_a?(Treat::Entities::Token)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"Cannot get the frequency " +
|
"Cannot get the frequency " +
|
|
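frequency_of leans on the token registry kept by the root entity, so the receiver has to contain tokens already. A sketch, assuming the usual Treat task names:

    s = Treat::Entities::Entity.build('The cat saw the cat.')
    s.apply(:tokenize)        # #do on the mongo-data-sets branch
    s.frequency_of('cat')     # => 2 (master downcases the value before the lookup)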
@ -0,0 +1,83 @@
|
||||||
|
# When Treat.debug is set to true, each call to
|
||||||
|
# #call_worker will result in a debug message being
|
||||||
|
# printed by the #print_debug function.
|
||||||
|
module Treat::Entities::Abilities::Debuggable
|
||||||
|
|
||||||
|
@@prev = nil
|
||||||
|
@@i = 0
|
||||||
|
|
||||||
|
# Explains what Treat is currently doing.
|
||||||
|
def print_debug(entity, task, worker, group, options)
|
||||||
|
|
||||||
|
targs = group.targets.map do |target|
|
||||||
|
target.to_s
|
||||||
|
end
|
||||||
|
|
||||||
|
if targs.size == 1
|
||||||
|
t = targs[0]
|
||||||
|
else
|
||||||
|
t = targs[0..-2].join(', ') +
|
||||||
|
' and/or ' + targs[-1]
|
||||||
|
end
|
||||||
|
|
||||||
|
genitive = targs.size > 1 ?
|
||||||
|
'their' : 'its'
|
||||||
|
|
||||||
|
doing = ''
|
||||||
|
|
||||||
|
human_task = task.to_s.gsub('_', ' ')
|
||||||
|
|
||||||
|
if group.type == :transformer ||
|
||||||
|
group.type == :computer
|
||||||
|
|
||||||
|
tt = human_task
|
||||||
|
tt = tt[0..-2] if tt[-1] == 'e'
|
||||||
|
ed = tt[-1] == 'd' ? '' : 'ed'
|
||||||
|
doing = "#{tt.capitalize}#{ed} #{t}"
|
||||||
|
|
||||||
|
elsif group.type == :annotator
|
||||||
|
|
||||||
|
if group.preset_option
|
||||||
|
opt = options[group.preset_option]
|
||||||
|
form = opt.to_s.gsub('_', ' ')
|
||||||
|
human_task[-1] = ''
|
||||||
|
human_task = form + ' ' + human_task
|
||||||
|
end
|
||||||
|
|
||||||
|
doing = "Annotated #{t} with " +
|
||||||
|
"#{genitive} #{human_task}"
|
||||||
|
end
|
||||||
|
|
||||||
|
if group.to_s.index('Formatters')
|
||||||
|
curr = doing +
|
||||||
|
' in format ' +
|
||||||
|
worker.to_s
|
||||||
|
else
|
||||||
|
curr = doing +
|
||||||
|
' using ' +
|
||||||
|
worker.to_s.gsub('_', ' ')
|
||||||
|
end
|
||||||
|
|
||||||
|
curr.gsub!('ss', 's') unless curr.index('class')
|
||||||
|
curr += '.'
|
||||||
|
|
||||||
|
if curr == @@prev
|
||||||
|
@@i += 1
|
||||||
|
else
|
||||||
|
if @@i > 1
|
||||||
|
Treat.core.entities.list.each do |e|
|
||||||
|
@@prev.gsub!(e.to_s, e.to_s + 's')
|
||||||
|
end
|
||||||
|
@@prev.gsub!('its', 'their')
|
||||||
|
@@prev = @@prev.split(' ').
|
||||||
|
insert(1, @@i.to_s).join(' ')
|
||||||
|
end
|
||||||
|
@@i = 0
|
||||||
|
puts @@prev # Last call doesn't get shown.
|
||||||
|
end
|
||||||
|
|
||||||
|
@@prev = curr
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
@ -1,7 +1,7 @@
|
||||||
# Makes a class delegatable, allowing calls
|
# Makes a class delegatable, allowing calls
|
||||||
# on it to be forwarded to a worker class
|
# on it to be forwarded to a worker class
|
||||||
# able to perform the appropriate task.
|
# able to perform the appropriate task.
|
||||||
module Treat::Entities::Entity::Delegatable
|
module Treat::Entities::Abilities::Delegatable
|
||||||
|
|
||||||
# Add preset methods to an entity class.
|
# Add preset methods to an entity class.
|
||||||
def add_presets(group)
|
def add_presets(group)
|
||||||
|
@ -10,25 +10,27 @@ module Treat::Entities::Entity::Delegatable
|
||||||
return unless opt
|
return unless opt
|
||||||
|
|
||||||
self.class_eval do
|
self.class_eval do
|
||||||
group.presets.each do |preset|
|
group.presets.each do |preset|
|
||||||
define_method(preset) do |worker=nil, options={}|
|
define_method(preset) do |worker=nil, options={}|
|
||||||
return get(preset) if has?(preset)
|
return get(preset) if has?(preset)
|
||||||
options = {opt => preset}.merge(options)
|
options = {opt => preset}.merge(options)
|
||||||
m = group.method
|
m = group.method
|
||||||
send(m, worker, options)
|
send(m, worker, options)
|
||||||
f = unset(m)
|
f = unset(m)
|
||||||
features[preset] = f if f
|
features[preset] = f if f
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Add the workers to perform a task on an entity class.
|
# Add the workers to perform a task on an entity class.
|
||||||
def add_workers(group)
|
def add_workers(group)
|
||||||
self.class_eval do
|
self.class_eval do
|
||||||
|
|
||||||
task = group.method
|
task = group.method
|
||||||
add_presets(group)
|
add_presets(group)
|
||||||
|
|
||||||
define_method(task) do |worker=nil, options={}|
|
define_method(task) do |worker=nil, options={}|
|
||||||
if worker.is_a?(Hash)
|
if worker.is_a?(Hash)
|
||||||
options, worker =
|
options, worker =
|
||||||
|
@ -62,7 +64,7 @@ module Treat::Entities::Entity::Delegatable
|
||||||
worker_not_found(worker, group)
|
worker_not_found(worker, group)
|
||||||
end
|
end
|
||||||
|
|
||||||
worker = group.const_get(worker.to_s.cc.intern)
|
worker = group.const_get(cc(worker.to_s).intern)
|
||||||
result = worker.send(group.method, entity, options)
|
result = worker.send(group.method, entity, options)
|
||||||
|
|
||||||
if group.type == :annotator && result
|
if group.type == :annotator && result
|
||||||
|
@ -88,32 +90,40 @@ module Treat::Entities::Entity::Delegatable
|
||||||
# Get the default worker for that language
|
# Get the default worker for that language
|
||||||
# inside the given group.
|
# inside the given group.
|
||||||
def find_worker_for_language(language, group)
|
def find_worker_for_language(language, group)
|
||||||
|
|
||||||
lang = Treat.languages[language]
|
lang = Treat.languages[language]
|
||||||
cat = group.to_s.split('::')[2].downcase.intern
|
cat = group.to_s.split('::')[2].downcase.intern
|
||||||
group = group.mn.ucc.intern
|
group = ucc(cl(group)).intern
|
||||||
|
|
||||||
if lang.nil?
|
if lang.nil?
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"No configuration file loaded for language #{language}."
|
"No configuration file loaded for language #{language}."
|
||||||
end
|
end
|
||||||
|
|
||||||
workers = lang.workers
|
workers = lang.workers
|
||||||
|
|
||||||
if !workers.respond_to?(cat) ||
|
if !workers.respond_to?(cat) ||
|
||||||
!workers[cat].respond_to?(group)
|
!workers[cat].respond_to?(group)
|
||||||
workers = Treat.languages.agnostic.workers
|
workers = Treat.languages.agnostic.workers
|
||||||
end
|
end
|
||||||
|
|
||||||
if !workers.respond_to?(cat) ||
|
if !workers.respond_to?(cat) ||
|
||||||
!workers[cat].respond_to?(group)
|
!workers[cat].respond_to?(group)
|
||||||
raise Treat::Exception,
|
raise Treat::Exception,
|
||||||
"No #{group} is/are available for the " +
|
"No #{group} is/are available for the " +
|
||||||
"#{language.to_s.capitalize} language."
|
"#{language.to_s.capitalize} language."
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
workers[cat][group].first
|
workers[cat][group].first
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return an error message and suggest possible typos.
|
# Return an error message and suggest possible typos.
|
||||||
def worker_not_found(worker, group)
|
def worker_not_found(klass, group)
|
||||||
"Worker with name '#{worker}' couldn't be "+
|
"Algorithm '#{ucc(cl(klass))}' couldn't be "+
|
||||||
"found in group #{group}." + Treat::Helpers::Help.
|
"found in group #{group}." + did_you_mean?(
|
||||||
did_you_mean?(group.list.map { |c| c.ucc }, worker)
|
group.list.map { |c| ucc(c) }, ucc(klass))
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
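add_workers is what turns each worker group into an instance method on the matching entity classes: the generated method takes an optional worker name plus options and otherwise falls back to the per-language default chosen by find_worker_for_language. Illustrative calls (the task and worker names are examples, not an exhaustive list):

    sentence = Treat::Entities::Entity.build('A sample sentence to tokenize.')
    sentence.tokenize                        # default worker for the entity's language
    # sentence.tokenize(:ptb)                # or: name a specific worker in the group
    # sentence.tokenize(:ptb, option: true)  # options are forwarded to the worker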
@ -1,8 +1,8 @@
|
||||||
# Implement support for the functions #do and #do_task.
|
# Implement support for the functions #do and #do_task.
|
||||||
module Treat::Entities::Entity::Applicable
|
module Treat::Entities::Abilities::Doable
|
||||||
|
|
||||||
# Perform the supplied tasks on the entity.
|
# Perform the supplied tasks on the entity.
|
||||||
def apply(*tasks)
|
def do(*tasks)
|
||||||
tasks.each do |task|
|
tasks.each do |task|
|
||||||
|
|
||||||
if task.is_a?(Hash)
|
if task.is_a?(Hash)
|
||||||
|
@ -25,8 +25,6 @@ module Treat::Entities::Entity::Applicable
|
||||||
end
|
end
|
||||||
self
|
self
|
||||||
end
|
end
|
||||||
|
|
||||||
alias :do :apply
|
|
||||||
|
|
||||||
# Perform an individual task on an entity
|
# Perform an individual task on an entity
|
||||||
# given a worker and options to pass to it.
|
# given a worker and options to pass to it.
|
||||||
|
@ -35,7 +33,7 @@ module Treat::Entities::Entity::Applicable
|
||||||
entity_types = group.targets
|
entity_types = group.targets
|
||||||
f = nil
|
f = nil
|
||||||
entity_types.each do |t|
|
entity_types.each do |t|
|
||||||
f = true if is_a?(Treat::Entities.const_get(t.cc))
|
f = true if is_a?(Treat::Entities.const_get(cc(t)))
|
||||||
end
|
end
|
||||||
if f || entity_types.include?(:entity)
|
if f || entity_types.include?(:entity)
|
||||||
send(task, worker, options)
|
send(task, worker, options)
|
|
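Since #do (renamed #apply on master, with #do kept as an alias) just loops over the supplied tasks and dispatches each one through do_task, the usual processing chain reads like this sketch (the file name is hypothetical):

    doc = Treat::Entities::Document.build('article.txt')
    doc.apply(:chunk, :segment, :tokenize)    # master
    # doc.do(:chunk, :segment, :tokenize)     # mongo-data-sets, and still valid on master via the alias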
@ -1,7 +1,7 @@
|
||||||
module Treat::Entities::Entity::Exportable
|
module Treat::Entities::Abilities::Exportable
|
||||||
|
|
||||||
def export(problem)
|
def export(problem)
|
||||||
ds = Treat::Learning::DataSet.new(problem)
|
ds = Treat::Core::DataSet.new(problem)
|
||||||
each_entity(problem.question.target) do |e|
|
each_entity(problem.question.target) do |e|
|
||||||
ds << e
|
ds << e
|
||||||
end
|
end
|
|
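Combined with the Problem sketched earlier, #export walks every entity of the question's target type and appends it to a data set. Roughly:

    # problem: see the Problem sketch above; collection: any collection of documents
    data_set = collection.export(problem)
    # => Treat::Learning::DataSet on master, Treat::Core::DataSet on mongo-data-sets,
    #    with one row per entity of type problem.question.target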
@ -1,4 +1,4 @@
|
||||||
module Treat::Entities::Entity::Iterable
|
module Treat::Entities::Abilities::Iterable
|
||||||
|
|
||||||
# Yields each entity of any of the supplied
|
# Yields each entity of any of the supplied
|
||||||
# types in the children tree of this Entity.
|
# types in the children tree of this Entity.
|
||||||
|
@ -6,12 +6,12 @@ module Treat::Entities::Entity::Iterable
|
||||||
# #each. It does not yield the top element being
|
# #each. It does not yield the top element being
|
||||||
# recursed.
|
# recursed.
|
||||||
#
|
#
|
||||||
# This function NEEDS to be ported to C. #FIXME
|
# This function NEEDS to be ported to C.
|
||||||
def each_entity(*types)
|
def each_entity(*types)
|
||||||
types = [:entity] if types.size == 0
|
types = [:entity] if types.size == 0
|
||||||
f = false
|
f = false
|
||||||
types.each do |t2|
|
types.each do |t2|
|
||||||
if is_a?(Treat::Entities.const_get(t2.cc))
|
if is_a?(Treat::Entities.const_get(cc(t2)))
|
||||||
f = true; break
|
f = true; break
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -57,7 +57,7 @@ module Treat::Entities::Entity::Iterable
|
||||||
def ancestor_with_type(type)
|
def ancestor_with_type(type)
|
||||||
return unless has_parent?
|
return unless has_parent?
|
||||||
ancestor = @parent
|
ancestor = @parent
|
||||||
type_klass = Treat::Entities.const_get(type.cc)
|
type_klass = Treat::Entities.const_get(cc(type))
|
||||||
while not ancestor.is_a?(type_klass)
|
while not ancestor.is_a?(type_klass)
|
||||||
return nil unless (ancestor && ancestor.has_parent?)
|
return nil unless (ancestor && ancestor.has_parent?)
|
||||||
ancestor = ancestor.parent
|
ancestor = ancestor.parent
|
||||||
|
@ -94,17 +94,25 @@ module Treat::Entities::Entity::Iterable
|
||||||
end
|
end
|
||||||
|
|
||||||
# Number of children that have a given feature.
|
# Number of children that have a given feature.
|
||||||
# Second variable to allow for passing value to check for.
|
def num_children_with_feature(feature)
|
||||||
def num_children_with_feature(feature, value = nil, recursive = true)
|
|
||||||
i = 0
|
i = 0
|
||||||
m = method(recursive ? :each_entity : :each)
|
each do |c|
|
||||||
m.call do |c|
|
i += 1 if c.has?(feature)
|
||||||
next unless c.has?(feature)
|
|
||||||
i += (value == nil ? 1 :
|
|
||||||
(c.get(feature) == value ? 1 : 0))
|
|
||||||
end
|
end
|
||||||
i
|
i
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Return the first element in the array, warning if not
|
||||||
|
# the only one in the array. Used for magic methods: e.g.,
|
||||||
|
# the magic method "word" if called on a sentence with many
|
||||||
|
# words, Treat will return the first word, but warn the user.
|
||||||
|
def first_but_warn(array, type)
|
||||||
|
if array.size > 1
|
||||||
|
warn "Warning: requested one #{type}, but" +
|
||||||
|
" there are many #{type}s in this entity."
|
||||||
|
end
|
||||||
|
array[0]
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
|
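A few illustrative uses of the iteration helpers above; note the branch difference in num_children_with_feature, where master adds optional value and recursive arguments:

    sentence = Treat::Entities::Entity.build('The cat sat.')
    sentence.apply(:tokenize, :tag)                    # #do on the mongo-data-sets branch
    sentence.each_entity(:word) { |w| puts w.to_s }
    sentence.num_children_with_feature(:tag)           # children carrying a :tag annotation
    sentence.num_children_with_feature(:tag, 'NN')     # master only: children whose tag equals 'NN'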
@ -1,20 +1,27 @@
|
||||||
module Treat::Entities::Entity::Magical
|
module Treat::Entities::Abilities::Magical
|
||||||
|
|
||||||
# Parse "magic methods", which allow the following
|
# Parse "magic methods", which allow the following
|
||||||
# syntaxes to be used (where 'word' can be replaced
|
# syntaxes to be used (where 'word' can be replaced
|
||||||
# by any entity type, e.g. token, zone, etc.):
|
# by any entity type, e.g. token, zone, etc.):
|
||||||
#
|
#
|
||||||
# - each_word : iterate over each child of type word.
|
# - each_word : iterate over each entity of type word.
|
||||||
# - words: return an array of children words.
|
# - words: return an array of words in the entity.
|
||||||
# - word: return the first word in the entity.
|
# - word: return the first word in the entity.
|
||||||
# - word_count: return the number of words in the entity.
|
# - word_count: return the number of words in the entity.
|
||||||
# - words_with_*(value) (where * is an arbitrary feature):
|
# - words_with_*(value) (where * is an arbitrary feature):
|
||||||
# return the words that have the given feature set to value.
|
# return the words that have the given feature.
|
||||||
|
# - word_with_*(value) : return the first word with
|
||||||
|
# the feature specified by * in value.
|
||||||
|
#
|
||||||
|
# Also provides magical methods for types of words:
|
||||||
|
#
|
||||||
|
# - each_noun:
|
||||||
|
# - nouns:
|
||||||
|
# - noun:
|
||||||
|
# - noun_count:
|
||||||
|
# - nouns_with_*(value)
|
||||||
|
# - noun_with_*(value)
|
||||||
#
|
#
|
||||||
# Also provides magical methods for types of words (each_noun,
|
|
||||||
# nouns, noun_count, nouns_with_*(value) noun_with_*(value), etc.)
|
|
||||||
# For this to be used, the words in the text must have been
|
|
||||||
# tokenized and categorized in the first place.
|
|
||||||
def magic(sym, *args)
|
def magic(sym, *args)
|
||||||
|
|
||||||
# Cache this for performance.
|
# Cache this for performance.
|
||||||
|
@ -73,21 +80,9 @@ module Treat::Entities::Entity::Magical
|
||||||
elsif method =~ /^frequency_in_#{@@entities_regexp}$/
|
elsif method =~ /^frequency_in_#{@@entities_regexp}$/
|
||||||
frequency_in($1.intern)
|
frequency_in($1.intern)
|
||||||
else
|
else
|
||||||
return :no_magic # :-(
|
return :no_magic
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return the first element in the array, warning if not
|
|
||||||
# the only one in the array. Used for magic methods: e.g.,
|
|
||||||
# the magic method "word" if called on a sentence with many
|
|
||||||
# words, Treat will return the first word, but warn the user.
|
|
||||||
def first_but_warn(array, type)
|
|
||||||
if array.size > 1
|
|
||||||
warn "Warning: requested one #{type}, but" +
|
|
||||||
" there are many #{type}s in this entity."
|
|
||||||
end
|
|
||||||
array[0]
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
|
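Concretely, the magic dispatch gives every entity type a family of generated accessors. A few examples on a tokenized and tagged sentence (the 'NN' tag value is illustrative, and a tagger is assumed to be available for the language):

    sentence = Treat::Entities::Entity.build('The quick brown fox jumps.')
    sentence.apply(:tokenize, :tag)   # #do on the mongo-data-sets branch
    sentence.words                    # all word children
    sentence.word                     # the first word (warns if there are several)
    sentence.word_count               # number of words
    sentence.each_word { |w| puts w }
    sentence.words_with_tag('NN')     # words whose :tag feature equals 'NN'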
@ -0,0 +1,46 @@
|
||||||
|
# Registers occurrences of textual values inside
|
||||||
|
# all child entities. Useful for calculating frequencies.
|
||||||
|
module Treat::Entities::Abilities::Registrable
|
||||||
|
|
||||||
|
# Registers a token in the @registry hash.
|
||||||
|
def register(entity)
|
||||||
|
|
||||||
|
unless @registry
|
||||||
|
@count = 0
|
||||||
|
@registry = {
|
||||||
|
:value => {},
|
||||||
|
:position => {},
|
||||||
|
:type => {},
|
||||||
|
:id => {}
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
if entity.is_a?(Treat::Entities::Token) ||
|
||||||
|
entity.is_a?(Treat::Entities::Phrase)
|
||||||
|
val = entity.to_s.downcase
|
||||||
|
@registry[:value][val] ||= 0
|
||||||
|
@registry[:value][val] += 1
|
||||||
|
end
|
||||||
|
|
||||||
|
@registry[:id][entity.id] = true
|
||||||
|
@registry[:type][entity.type] ||= 0
|
||||||
|
@registry[:type][entity.type] += 1
|
||||||
|
@registry[:position][entity.id] = @count
|
||||||
|
@count += 1
|
||||||
|
|
||||||
|
@parent.register(entity) if has_parent?
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
|
# Backtrack up the tree to find a token registry,
|
||||||
|
# by default the one in the root node of any entity.
|
||||||
|
def registry(type = nil)
|
||||||
|
if has_parent? &&
|
||||||
|
type != self.type
|
||||||
|
@parent.registry(type)
|
||||||
|
else
|
||||||
|
@registry
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
|
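The registry built above is what the counting helpers read from: after tokenization, the root node holds per-value counts keyed by the downcased token string, alongside per-type counts and positions. A sketch:

    s = Treat::Entities::Entity.build('The cat saw the cat.')
    s.apply(:tokenize)           # #do on this branch
    s.registry[:value]['cat']    # => 2
    s.registry[:type][:word]     # => number of word tokens registered in the subtree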
@ -1,22 +1,18 @@
|
||||||
# Gives entities the ability to be converted
|
# Gives entities the ability to be converted
|
||||||
# to string representations (#to_string, #to_s,
|
# to string representations (#to_string, #to_s,
|
||||||
# #to_str, #inspect, #print_tree).
|
# #to_str, #inspect, #print_tree).
|
||||||
module Treat::Entities::Entity::Stringable
|
module Treat::Entities::Abilities::Stringable
|
||||||
|
|
||||||
# Returns the entity's true string value.
|
# Return the entity's true string value in
|
||||||
def to_string; @value.dup; end
|
# plain text format. Non-terminal entities
|
||||||
|
# will normally have an empty value.
|
||||||
# Returns an array of the childrens' string
|
def to_string; @value; end
|
||||||
# values, found by calling #to_s on them.
|
|
||||||
def to_a; @children.map { |c| c.to_s }; end
|
|
||||||
|
|
||||||
alias :to_ary :to_a
|
|
||||||
|
|
||||||
# Returns the entity's string value by
|
# Returns the entity's string value by
|
||||||
# imploding the value of all terminal
|
# imploding the value of all terminal
|
||||||
# entities in the subtree of that entity.
|
# entities in the subtree of that entity.
|
||||||
def to_s
|
def to_s
|
||||||
has_children? ? implode.strip : @value.dup
|
@value != '' ? @value : implode.strip
|
||||||
end
|
end
|
||||||
|
|
||||||
# #to_str is the same as #to_s.
|
# #to_str is the same as #to_s.
|
||||||
|
@ -28,10 +24,12 @@ module Treat::Entities::Entity::Stringable
|
||||||
def short_value(max_length = 30)
|
def short_value(max_length = 30)
|
||||||
s = to_s
|
s = to_s
|
||||||
words = s.split(' ')
|
words = s.split(' ')
|
||||||
return s if (s.length < max_length) ||
|
if s.length < max_length
|
||||||
!(words[0..2] && words[-2..-1])
|
s
|
||||||
words[0..2].join(' ') + ' [...] ' +
|
else
|
||||||
words[-2..-1].join(' ')
|
words[0..2].join(' ') + ' [...] ' +
|
||||||
|
words[-2..-1].join(' ')
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# Print out an ASCII representation of the tree.
|
# Print out an ASCII representation of the tree.
|
||||||
|
@ -40,8 +38,7 @@ module Treat::Entities::Entity::Stringable
|
||||||
# Return an informative string representation
|
# Return an informative string representation
|
||||||
# of the entity.
|
# of the entity.
|
||||||
def inspect
|
def inspect
|
||||||
name = self.class.mn
|
s = "#{cl(self.class)} (#{@id.to_s})"
|
||||||
s = "#{name} (#{@id.to_s})"
|
|
||||||
if caller_method(2) == :inspect
|
if caller_method(2) == :inspect
|
||||||
@id.to_s
|
@id.to_s
|
||||||
else
|
else
|
||||||
|
@ -58,14 +55,16 @@ module Treat::Entities::Entity::Stringable
|
||||||
end
|
end
|
||||||
|
|
||||||
# Helper method to implode the string value of the subtree.
|
# Helper method to implode the string value of the subtree.
|
||||||
def implode(value = "")
|
def implode
|
||||||
|
|
||||||
return @value.dup if !has_children?
|
return @value.dup if !has_children?
|
||||||
|
|
||||||
|
value = ''
|
||||||
|
|
||||||
each do |child|
|
each do |child|
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Section)
|
if child.is_a?(Treat::Entities::Section)
|
||||||
value << "\n\n"
|
value += "\n\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
if child.is_a?(Treat::Entities::Token) || child.value != ''
|
||||||
|
@ -73,14 +72,14 @@ module Treat::Entities::Entity::Stringable
|
||||||
child.is_a?(Treat::Entities::Enclitic)
|
child.is_a?(Treat::Entities::Enclitic)
|
||||||
value.strip!
|
value.strip!
|
||||||
end
|
end
|
||||||
value << child.to_s + ' '
|
value += child.to_s + ' '
|
||||||
else
|
else
|
||||||
child.implode(value)
|
value += child.implode
|
||||||
end
|
end
|
||||||
|
|
||||||
if child.is_a?(Treat::Entities::Title) ||
|
if child.is_a?(Treat::Entities::Title) ||
|
||||||
child.is_a?(Treat::Entities::Paragraph)
|
child.is_a?(Treat::Entities::Paragraph)
|
||||||
value << "\n\n"
|
value += "\n\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
|
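The net effect of the string helpers above, sketched on a small entity (text illustrative):

    z = Treat::Entities::Entity.build('A first sentence. A second, longer sentence follows it.')
    z.to_s          # the stored value, or the imploded leaf values once the tree has children
    z.short_value   # => "A first sentence. [...] follows it." once the string exceeds 30 characters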
@ -0,0 +1,40 @@
|
||||||
|
module Treat::Entities
|
||||||
|
# Represents a collection of texts.
|
||||||
|
class Collection < Entity
|
||||||
|
|
||||||
|
# Initialize the collection with a folder
|
||||||
|
# containing the texts of the collection.
|
||||||
|
def initialize(folder = nil, id = nil)
|
||||||
|
super('', id)
|
||||||
|
if folder
|
||||||
|
if !FileTest.directory?(folder)
|
||||||
|
FileUtils.mkdir(folder)
|
||||||
|
end
|
||||||
|
set :folder, folder if folder
|
||||||
|
i = folder + '/.index'
|
||||||
|
if FileTest.directory?(i)
|
||||||
|
set :index, i
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Works like the default <<, but if the
|
||||||
|
# entity being added is a collection or a
|
||||||
|
# document, then copy that collection or
|
||||||
|
# document into this collection's folder.
|
||||||
|
def <<(entities, copy = true)
|
||||||
|
unless entities.is_a?(Array)
|
||||||
|
entities = [entities]
|
||||||
|
end
|
||||||
|
entities.each do |entity|
|
||||||
|
if [:document, :collection].
|
||||||
|
include?(entity.type) && copy &&
|
||||||
|
@features[:folder] != nil
|
||||||
|
entity = entity.copy_into(self)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
super(entities)
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
|
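The copy-on-add behaviour above means a collection physically gathers its members on this branch (on master, the second argument of Entity#<< is clear_parent instead). A sketch with illustrative paths:

    corpus = Treat::Entities::Collection.build('corpus')    # creates corpus/ when missing
    draft  = Treat::Entities::Document.build('draft.txt')

    corpus << draft            # copies draft.txt into corpus/ before attaching it
    # corpus.<<(draft, false)  # or: attach without copying the backing file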
@ -0,0 +1,10 @@
|
||||||
|
module Treat::Entities
|
||||||
|
# Represents a document.
|
||||||
|
class Document < Entity
|
||||||
|
# Initialize a document with a file name.
|
||||||
|
def initialize(file = nil, id = nil)
|
||||||
|
super('', id)
|
||||||
|
set :file, file
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -1,101 +0,0 @@
|
||||||
module Treat::Entities
|
|
||||||
|
|
||||||
# * Collection and document classes * #
|
|
||||||
|
|
||||||
# Represents a collection.
|
|
||||||
class Collection < Entity; end
|
|
||||||
|
|
||||||
# Represents a document.
|
|
||||||
class Document < Entity; end
|
|
||||||
|
|
||||||
# * Sections and related classes * #
|
|
||||||
|
|
||||||
# Represents a section.
|
|
||||||
class Section < Entity; end
|
|
||||||
|
|
||||||
# Represents a page of text.
|
|
||||||
class Page < Section; end
|
|
||||||
|
|
||||||
# Represents a block of text
|
|
||||||
class Block < Section; end
|
|
||||||
|
|
||||||
# Represents a list.
|
|
||||||
class List < Section; end
|
|
||||||
|
|
||||||
# * Zones and related classes * #
|
|
||||||
|
|
||||||
# Represents a zone of text.
|
|
||||||
class Zone < Entity; end
|
|
||||||
|
|
||||||
# Represents a title, subtitle,
|
|
||||||
# logical header of a text.
|
|
||||||
class Title < Zone; end
|
|
||||||
|
|
||||||
# Represents a paragraph (group
|
|
||||||
# of sentences and/or phrases).
|
|
||||||
class Paragraph < Zone; end
|
|
||||||
|
|
||||||
# * Groups and related classes * #
|
|
||||||
|
|
||||||
# Represents a group of tokens.
|
|
||||||
class Group < Entity; end
|
|
||||||
|
|
||||||
# Represents a group of words
|
|
||||||
# with a sentence ender (.!?)
|
|
||||||
class Sentence < Group; end
|
|
||||||
|
|
||||||
# Represents a group of words,
|
|
||||||
# with no sentence ender.
|
|
||||||
class Phrase < Group; end
|
|
||||||
|
|
||||||
# Represents a non-linguistic
|
|
||||||
# fragment (e.g. stray symbols).
|
|
||||||
class Fragment < Group; end
|
|
||||||
|
|
||||||
# * Tokens and related classes* #
|
|
||||||
|
|
||||||
# Represents a terminal element
|
|
||||||
# (leaf) in the text structure.
|
|
||||||
class Token < Entity; end
|
|
||||||
|
|
||||||
# Represents a word. Strictly,
|
|
||||||
# this is /^[[:alpha:]\-']+$/.
|
|
||||||
class Word < Token; end
|
|
||||||
|
|
||||||
# Represents an enclitic.
|
|
||||||
# Strictly, this is any of
|
|
||||||
# 'll 'm 're 's 't or 've.
|
|
||||||
class Enclitic < Token; end
|
|
||||||
|
|
||||||
# Represents a number. Strictly,
|
|
||||||
# this is /^#?([0-9]+)(\.[0-9]+)?$/.
|
|
||||||
class Number < Token
|
|
||||||
def to_i; to_s.to_i; end
|
|
||||||
def to_f; to_s.to_f; end
|
|
||||||
end
|
|
||||||
|
|
||||||
# Represents a punctuation sign.
|
|
||||||
# Strictly, this is /^[[:punct:]\$]+$/.
|
|
||||||
class Punctuation < Token; end
|
|
||||||
|
|
||||||
# Represents a character that is neither
|
|
||||||
# a word, an enclitic, a number or a
|
|
||||||
# punctuation character (e.g. @#$%&*).
|
|
||||||
class Symbol < Token; end
|
|
||||||
|
|
||||||
# Represents a url. This is (imperfectly)
|
|
||||||
# defined as /^(http|https):\/\/[a-z0-9]
|
|
||||||
# +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
|
|
||||||
# (([0-9]{1,5})?\/.*)?$/ix
|
|
||||||
class Url < Token; end
|
|
||||||
|
|
||||||
# Represents a valid RFC822 address.
|
|
||||||
# This is (imperfectly) defined as
|
|
||||||
# /.+\@.+\..+/ (fixme maybe?)
|
|
||||||
class Email < Token; end
|
|
||||||
|
|
||||||
# Represents a token whose type
|
|
||||||
# cannot be identified.
|
|
||||||
class Unknown; end
|
|
||||||
|
|
||||||
end
|
|
|
@ -1,106 +1,94 @@
|
||||||
module Treat::Entities
|
module Treat::Entities
|
||||||
|
|
||||||
# Basic tree structure.
|
module Abilities; end
|
||||||
|
|
||||||
|
# Require abilities.
|
||||||
|
p = Treat.paths.lib +
|
||||||
|
'treat/entities/abilities/*.rb'
|
||||||
|
Dir.glob(p).each { |f| require f }
|
||||||
|
|
||||||
require 'birch'
|
require 'birch'
|
||||||
|
|
||||||
# The Entity class extends a basic tree structure
|
|
||||||
# (written in C for optimal speed) and represents
|
|
||||||
# any form of textual entity in a processing task
|
|
||||||
# (this could be a collection of documents, a
|
|
||||||
# single document, a single paragraph, etc.)
|
|
||||||
#
|
|
||||||
# Classes that extend Entity provide the concrete
|
|
||||||
# behavior corresponding to the relevant entity type.
|
|
||||||
-  # See entities.rb for a full list and description of
-  # the different entity types in the document model.
   class Entity < ::Birch::Tree

-    # A symbol representing the lowercase
-    # version of the class name. This is
-    # the only attribute that the Entity
-    # class adds to the Birch::Tree class.
+    # A Symbol representing the lowercase
+    # version of the class name.
     attr_accessor :type

-    # Autoload all the classes in /abilities.
-    path = File.expand_path(__FILE__)
-    patt = File.dirname(path) + '/entity/*.rb'
-    Dir.glob(patt).each { |f| require f }

-    # Implements support for #register, #registry.
-    include Registrable

-    # Implement support for #self.call_worker, etc.
-    extend Delegatable
+    # Implements support for #register,
+    # #registry, and #contains_* methods.
+    include Abilities::Registrable

-    # Implement support for #self.print_debug, etc.
-    extend Debuggable
+    # Implement support for #self.add_workers
+    extend Abilities::Delegatable

-    # Implement support for #self.build and #self.from_*
-    extend Buildable
+    # Implement support for #self.print_debug and
+    # #self.invalid_call_msg
+    extend Abilities::Debuggable

-    # Implement support for #apply (previously #do).
-    include Applicable
+    # Implement support for #self.build
+    # and #self.from_*
+    extend Abilities::Buildable

-    # Implement support for #frequency, #frequency_in,
-    # #frequency_of, #position, #position_from_end, etc.
-    include Countable
+    # Implement support for #do.
+    include Abilities::Doable

-    # Implement support for over 100 #magic methods!
-    include Magical
+    # Implement support for #frequency,
+    # #frequency_in_parent and #position_in_parent.
+    include Abilities::Countable

+    # Implement support for #magic.
+    include Abilities::Magical

     # Implement support for #to_s, #inspect, etc.
-    include Stringable
+    include Abilities::Stringable

-    # Implement support for #check_has and others.
-    include Checkable
+    # Implement support for #check_has
+    # and #check_hasnt_children?
+    include Abilities::Checkable

     # Implement support for #each_entity, as well as
     # #entities_with_type, #ancestors_with_type,
-    # #entities_with_feature, #entities_with_category, etc.
-    include Iterable
+    # #entities_with_feature, #entities_with_category.
+    include Abilities::Iterable

-    # Implement support for #export, allowing to export
-    # a data set row from the receiving entity.
-    include Exportable
+    # Implement support for #export to export
+    # a line of a data set based on a classification.
+    include Abilities::Exportable

+    # Implement support for #copy_into.
+    include Abilities::Copyable

     # Implement support for #self.compare_with
-    extend Comparable
+    extend Abilities::Comparable

     # Initialize the entity with its value and
     # (optionally) a unique identifier. By default,
     # the object_id will be used as id.
     def initialize(value = '', id = nil)
-      id ||= object_id; super(value, id)
+      id ||= object_id
+      super(value, id)
       @type = :entity if self == Entity
-      @type ||= self.class.mn.ucc.intern
+      @type ||= ucc(cl(self.class)).intern
     end

     # Add an entity to the current entity.
     # Registers the entity in the root node
     # token registry if the entity is a leaf.
-    # Unsets the parent node's value; in order
-    # to keep the tree clean, only the leaf
-    # values are stored.
-    #
-    # Takes in a single entity or an array of
-    # entities. Returns the first child supplied.
-    # If a string is
+    #
+    # @see Treat::Registrable
     def <<(entities, clear_parent = true)
-      entities = (entities.is_a?(::String) ||
-      entities.is_a?(::Numeric)) ?
-      entities.to_entity : entities
-      entities = entities.is_a?(::Array) ?
-      entities : [entities]
-      # Register each entity in this node.
-      entities.each { |e| register(e) }
-      # Pass to the <<() method in Birch.
+      unless entities.is_a? Array
+        entities = [entities]
+      end
+      entities.each do |entity|
+        register(entity)
+      end
       super(entities)
-      # Unset the parent value if necessary.
       @parent.value = '' if has_parent?
-      # Return the first child.
-      return entities[0]
+      entities[0]
     end

     # Catch missing methods to support method-like
     # access to features (e.g. entity.category
     # instead of entity.features[:category]) and to
@@ -114,26 +102,29 @@ module Treat::Entities
     # sugar for the #self.build method.
     def method_missing(sym, *args, &block)
       return self.build(*args) if sym == nil
-      return @features[sym] if @features.has_key?(sym)
-      result = magic(sym, *args, &block)
-      return result unless result == :no_magic
-      begin; super(sym, *args, &block)
-      rescue NoMethodError; invalid_call(sym); end
+      if !@features.has_key?(sym)
+        r = magic(sym, *args, &block)
+        return r unless r == :no_magic
+        begin
+          super(sym, *args, &block)
+        rescue NoMethodError
+          raise Treat::Exception,
+          if Treat::Workers.lookup(sym)
+            msg = "Method #{sym} cannot " +
+            "be called on a #{type}."
+          else
+            msg = "Method #{sym} does not exist."
+            msg += did_you_mean?(
+            Treat::Workers.methods, sym)
+          end
+        end
+      else
+        @features[sym]
+      end

     end

-    # Raises a Treat::Exception saying that the
-    # method called was invalid, and that the
-    # requested method does not exist. Also
-    # provides suggestions for misspellings.
-    def invalid_call(sym)
-      msg = Treat::Workers.lookup(sym) ?
-      "Method #{sym} can't be called on a #{type}." :
-      "Method #{sym} is not defined by Treat." +
-      Treat::Helpers::Help.did_you_mean?(
-      Treat::Workers.methods, sym)
-      raise Treat::Exception, msg
-    end

   end

 end
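Both sides of the entity.rb hunk above route unknown method names through method_missing so that stored features can be read as if they were methods (entity.category rather than entity.features[:category]), falling back to the "magic" handler and only then to an error. Below is a minimal standalone sketch of that lookup order; SketchEntity and its feature hash are illustrative names, not the gem's actual Entity class.

# Illustrative sketch only; not the gem's Entity implementation.
class SketchEntity
  attr_reader :features

  def initialize(features = {})
    @features = features
  end

  # Unknown calls are first resolved against the feature hash,
  # mirroring the lookup order shown in the diff above.
  def method_missing(sym, *args, &block)
    return @features[sym] if @features.key?(sym)
    super
  end

  def respond_to_missing?(sym, include_private = false)
    @features.key?(sym) || super
  end
end

e = SketchEntity.new(category: :noun)
e.category   # => :noun, same idea as e.features[:category]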
@@ -1,86 +0,0 @@
-# When Treat.debug is set to true, each call to
-# #call_worker will result in a debug message being
-# printed by the #print_debug function.
-module Treat::Entities::Entity::Debuggable
-
-  # Previous state and counter.
-  @@prev, @@i = nil, 0
-
-  # Explains what Treat is currently doing.
-  # Fixme: last call will never get shown.
-  def print_debug(entity, task, worker, group, options)
-    # Get a list of the worker's targets.
-    targets = group.targets.map(&:to_s)
-
-    # List the worker's targets as either
-    # a single target or an and/or form
-    # (since it would be too costly to
-    # actually determine what target types
-    # were processed at runtime for each call).
-    t = targets.size == 1 ? targets[0] : targets[
-    0..-2].join(', ') + ' and/or ' + targets[-1]
-
-    # Add genitive for annotations (sing./plural)
-    genitive = targets.size > 1 ? 'their' : 'its'
-
-    # Set up an empty string and humanize task name.
-    doing, human_task = '', task.to_s.gsub('_', ' ')
-
-    # Base is "{task}-ed {a(n)|N} {target(s)}"
-    if [:transformer, :computer].include?(group.type)
-      tt = human_task
-      tt = tt[0..-2] if tt[-1] == 'e'
-      ed = tt[-1] == 'd' ? '' : 'ed'
-      doing = "#{tt.capitalize}#{ed} #{t}"
-    # Base is "Annotated {a(n)|N} {target(s)}"
-    elsif group.type == :annotator
-      if group.preset_option
-        opt = options[group.preset_option]
-        form = opt.to_s.gsub('_', ' ')
-        human_task[-1] = ''
-        human_task = form + ' ' + human_task
-      end
-      doing = "Annotated #{t} with " +
-      "#{genitive} #{human_task}"
-    end
-
-    # Form is '{base} in format {worker}'.
-    if group.to_s.index('Formatters')
-      curr = doing + ' in format ' + worker.to_s
-    # Form is '{base} using {worker}'.
-    else
-      curr = doing + ' using ' + worker.to_s.gsub('_', ' ')
-    end
-
-    # Remove any double pluralization that may happen.
-    curr.gsub!('ss', 's') unless curr.index('class')
-
-    # Accumulate repeated tasks.
-    @@i += 1 if curr == @@prev
-
-    # Change tasks, so output.
-    if curr != @@prev && @@prev
-      # Pluralize entity names if necessary.
-      if @@i > 1
-        Treat.core.entities.list.each do |e|
-          @@prev.gsub!(e.to_s, e.to_s + 's')
-        end
-        @@prev.gsub!('its', 'their')
-        @@prev = @@prev.split(' ').
-        insert(1, @@i.to_s).join(' ')
-      # Add determiner if singular.
-      else
-        @@prev = @@prev.split(' ').
-        insert(1, 'a').join(' ')
-      end
-      # Reset counter.
-      @@i = 0
-      # Write to stdout.
-      puts @@prev + '.'
-    end
-
-    @@prev = curr
-
-  end
-
-end
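The deleted Debuggable module above assembles its progress messages from the worker group's target list: a single target is named directly, several targets are joined in an "and/or" form, and the genitive switches between "its" and "their". The following standalone sketch shows that message construction with illustrative values (the real method additionally handles preset options and de-pluralization).

# Illustrative values only; the gem derives these from the worker group.
targets = ['sentence', 'phrase']
task    = :parse
worker  = :stanford

t = targets.size == 1 ? targets[0] :
    targets[0..-2].join(', ') + ' and/or ' + targets[-1]
genitive   = targets.size > 1 ? 'their' : 'its'
human_task = task.to_s.gsub('_', ' ')

puts "Annotated #{t} with #{genitive} #{human_task} " \
     "using #{worker.to_s.gsub('_', ' ')}."
# => Annotated sentence and/or phrase with their parse using stanford.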
@@ -1,36 +0,0 @@
-# Registers the entities ocurring in the subtree of
-# a node as children are added. Also registers text
-# occurrences for word groups and tokens (n grams).
-module Treat::Entities::Entity::Registrable
-
-  # Registers a token or phrase in the registry.
-  # The registry keeps track of children by id,
-  # by entity type, and also keeps the position
-  # of the entity in its parent entity.
-  def register(entity)
-    unless @registry
-      @count, @registry = 0,
-      {id: {}, value: {}, position:{}, type: {}}
-    end
-    if entity.is_a?(Treat::Entities::Token) ||
-    entity.is_a?(Treat::Entities::Group)
-      val = entity.to_s.downcase
-      @registry[:value][val] ||= 0
-      @registry[:value][val] += 1
-    end
-    @registry[:id][entity.id] = true
-    @registry[:type][entity.type] ||= 0
-    @registry[:type][entity.type] += 1
-    @registry[:position][entity.id] = @count
-    @count += 1
-    @parent.register(entity) if has_parent?
-  end
-
-  # Backtrack up the tree to find a token registry,
-  # by default the one in the root node of the tree.
-  def registry(type = nil)
-    (has_parent? && type != self.type) ?
-    @parent.registry(type) : @registry
-  end
-
-end
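The deleted Registrable module above keeps one hash per node with four sub-indexes (:id, :value, :position, :type) and forwards each registration to the parent, so the root node ends up with counts for its whole subtree. A standalone sketch of the per-node bookkeeping, in plain Ruby with no Treat entities involved:

# Plain-Ruby sketch of the registry bookkeeping shown above.
registry = { id: {}, value: {}, position: {}, type: {} }
count    = 0

tokens = [[:word, 'The'], [:word, 'the'], [:punctuation, '.']]
tokens.each_with_index do |(type, value), id|
  val = value.downcase
  registry[:value][val] ||= 0
  registry[:value][val] += 1          # token counts by lowercased value
  registry[:id][id] = true            # known ids
  registry[:type][type] ||= 0
  registry[:type][type] += 1          # counts by entity type
  registry[:position][id] = count     # insertion order
  count += 1
end

registry[:value]  # => {"the"=>2, "."=>1}
registry[:type]   # => {:word=>2, :punctuation=>1}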
@@ -0,0 +1,18 @@
+module Treat::Entities
+
+  # Represents a group of tokens.
+  class Group < Entity; end
+
+  # Represents a group of words
+  # with a sentence ender (.!?)
+  class Sentence < Group; end
+
+  # Represents a group of words,
+  # with no sentence ender.
+  class Phrase < Group; end
+
+  # Represents a non-linguistic
+  # fragment (e.g. stray symbols).
+  class Fragment < Group; end
+
+end
@@ -0,0 +1,13 @@
+module Treat::Entities
+  # Represents a section.
+  class Section < Entity; end
+
+  # Represents a page of text.
+  class Page < Section; end
+
+  # Represents a block of text
+  class Block < Section; end
+
+  # Represents a list.
+  class List < Section; end
+end
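The two new files above only declare empty subclasses; their behaviour comes entirely from Entity, and each class's #type symbol is derived from its class name in #initialize (see the entity.rb hunk earlier, where the branch uses ucc(cl(self.class)).intern). A plain-Ruby sketch of that name-to-symbol derivation, using throwaway classes rather than the gem's:

# Illustrative sketch of how a class name becomes an entity's #type symbol.
class Entity; end
class Group    < Entity; end
class Sentence < Group;  end

def type_symbol(klass)
  name = klass.name.split('::').last               # drop any namespace
  name.gsub(/([a-z])([A-Z])/, '\1_\2').downcase.to_sym
end

type_symbol(Sentence)               # => :sentence
Sentence.ancestors.include?(Group)  # => true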
Some files were not shown because too many files have changed in this diff.