-
Notifications
You must be signed in to change notification settings - Fork 0
/
en_complet.py
64 lines (52 loc) · 2.53 KB
/
en_complet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import crf_absa16
import sys
import run
import data
import treetaggerwrapper
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import templates
# run.init receives the raw command-line arguments; its two results are
# forwarded unchanged to run.full_pipeline_with_fold at the end of the script.
(PARTS, WORKDIR) = run.init(sys.argv)

# Pipeline description begins here !

# Corpus locations, taken from the `data` module (judging by the attribute
# names: the 2015 English restaurant train / test / gold sets).
TRAIN_PATH = data.restaurants_en_2015
TEST_PATH = data.restaurants_en_test_2015
GOLD_PATH = data.restaurants_en_gold_2015

# One (build name, template) pair per model variant to build.
# NOTE: some build names use '-' where the matching attribute in `templates`
# uses '_' (e.g. 'en_all-form' <-> templates.en_all_form).
BUILDS = [
    ('en_all', templates.en_all),
    ('en_all_old', templates.en_all_old),
    ('en_all-form', templates.en_all_form),
    ('en_all-lex', templates.en_all_lex),
    ('en_all-morph', templates.en_all_morph),
    ('en_morph', templates.en_morph),
    ('en_form', templates.en_form),
    ('en_morph_lex', templates.en_morph_lex),
]
# Values handed to the lexicon features as their `not_found` / `found`
# options. Both are None here; presumably that lets crf_absa16 fall back to
# its own defaults -- TODO confirm against crf_absa16.f_lexicon.
NOT_FOUND = FOUND = None

# English TreeTagger instance; it is passed to the 'lemme' feature below.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

# Tokenizer forwarded to the pipeline runner. NLTK's TweetTokenizer is the
# active choice; the TreeTagger-based alternative is kept for reference:
# tokenizer = crf_absa16.TreeTaggerTokenizer(tagger)
tokenizer = TweetTokenizer()
def _lexicon_kwargs(lexicon):
    """Return the keyword arguments shared by every lexicon-based feature."""
    return dict(lexicon=lexicon, not_found=NOT_FOUND, found=FOUND)


# Feature table handed to the pipeline runner. Every row has five fields:
#   [feature name, crf_absa16 function, output column names, input key, kwargs]
# The input key is one of 'w', 'tokens' or 'c'; its exact meaning is defined
# by the runner -- TODO confirm in run / crf_absa16.
# The table starts with the raw WORD column and ends with the CLASS column.
FEATURES = [
    ['w', crf_absa16.f_none, ['WORD'], 'w', {}],
    ['word_shape', crf_absa16.f_word_shape, ['WORD_SHAPE'], 'w', {}],
    # The stop-word list is deliberately empty here; the NLTK English list is
    # the disabled alternative:
    # ['stop_words', crf_absa16.f_stopwords, ['STOP_WORDS'], 'w', dict(stopwords=stopwords.words('english'))],
    ['stop_words', crf_absa16.f_stopwords, ['STOP_WORDS'], 'w',
     {'stopwords': []}],
    ['senna', crf_absa16.f_senna, ['POS', 'CHK', 'NER'], 'tokens', {}],
    ['lemme', crf_absa16.f_treetagger_lemme, ['LEMME'], 'w',
     {'tagger': tagger}],
    ['bing_liu', crf_absa16.f_lexicon, ['BING_LIU'], 'w',
     _lexicon_kwargs(data.bing_liu_lexicon)],
    ['mpqa', crf_absa16.f_lexicon, ['MPQA'], 'w',
     _lexicon_kwargs(data.mpqa_lexicon)],
    ['mpqa_plus', crf_absa16.f_lexicon_multi_words, ['MPQA_PLUS'], 'w',
     _lexicon_kwargs(data.mpqa_plus_lexicon)],
    ['c', crf_absa16.f_none, ['CLASS'], 'c', {}],
]
# Shuffle flag passed to the runner (disabled for this experiment).
SHUFFLE = False
# Pipeline description ends here !

# Launch the run with everything configured above. All arguments except the
# tokenizer are positional, so their order must match the runner's signature.
run.full_pipeline_with_fold(
    PARTS,
    WORKDIR,
    BUILDS,
    FEATURES,
    SHUFFLE,
    TRAIN_PATH,
    TEST_PATH,
    GOLD_PATH,
    tokenizer=tokenizer,
)