forked from andreasvc/activedop
-
Notifications
You must be signed in to change notification settings - Fork 1
/
cgelbank2-punct.prm
67 lines (61 loc) · 2.65 KB
/
cgelbank2-punct.prm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Experiment parameters for the 'cgelbank2-punct' treebank.
# NOTE(review): every top-level entry ends in a comma, so this file is
# presumably evaluated as the keyword arguments of one dict(...) call by the
# disco-dop parameter reader -- keep the trailing commas; confirm the loader.
stages=[
    # Single parsing stage.
    # NOTE(review): meanings below follow the disco-dop parameter docs;
    # confirm against the installed version.
    dict(
        name='dop',
        mode='plcfrs',  # PLCFRS parser
        dop='doubledop',  # Double-DOP grammar
        m=1000,  # number of derivations considered
        estimator='rfe',  # relative frequency estimate
        objective='mpp',  # most probable parse
    ),
],
evalparam='../disco-dop/proper.prm',  # parameter file in EVALB style
corpusfmt='export',  # one of: export, bracket, discbracket, alpino, tiger
traincorpus=dict(
    path='cgelbank2-punct.export',
    encoding='utf-8',
    maxwords=100,  # upper bound on words per training sentence
    numsents=9999,  # size of the training corpus, in sentences
),
testcorpus=dict(
    # NOTE(review): same file as traincorpus and skiptrain is off, so the
    # test sentences are not held out from training. Fine if only the
    # trained grammar is needed; evaluation scores would be on seen data.
    path='cgelbank2-punct.export',
    encoding='utf-8',
    maxwords=100,  # upper bound on words per test sentence
    numsents=9999,  # parse at most this many test sentences
    skiptrain=False,  # set to True when train & test come from one file
            # and the training sentences must be skipped to reach the test set.
    skip=0,  # extra sentences to skip between the train & test portions
),
# Treebank preprocessing.
# NOTE(review): option lists below are taken from the disco-dop parameter
# documentation, not from this file -- verify before relying on them.
punct='move',  # options: None, 'move', 'moveall', 'prune', 'remove',
        # 'removeall', 'root'
functions=None,  # options: None, 'add', 'remove', 'replace'
morphology=None,  # options: None, 'add', 'replace', 'between'
ensureroot='ROOT',
# POS tagging; set postagging=None to take the tags from the treebank.
postagging=dict(
    # method is one of:
    #   unknownword            -- tags are assigned during parsing
    #   treetagger, stanford   -- external taggers
    method='unknownword',
    # model, when method is unknownword: 4, 6, or base;
    # when method is treetagger / stanford: filename of the tagger model.
    model='4',
    # settings for the unknown word models:
    unknownthreshold=1,  # rare-word probabilities stand in for unknown words
    openclassthreshold=50,  # add unseen tags for known words; 0 disables this
),
# Binarization settings.
binarization=dict(
    method='default',  # one of: default, optimal, optimalhead
    factor='right',  # right-factored binarization
            # (only relevant for non-optimal binarizations)
    # headrules='alpino.headrules',  # file with rules for head assignment
    h=1,  # horizontal Markovization: siblings of context
    v=1,  # vertical Markovization; 1 = no additional parent annotation
    revh=0,  # horizontal Markovization: siblings of preceding context
    pospa=False,  # with v > 1: also add parent annotation to POS tags?
    markhead=True,  # prepend the head node's label to its siblings
    leftmostunary=False,  # begin the binarization with a unary node
    rightmostunary=False,  # finish the binarization with a unary node
    tailmarker='',  # symbol appended to the last node of a binarization,
            # marking the head node
    revmarkov=False,  # reverse the order used for horizontal Markovization
    fanout_marks_before_bin=False,  # add fanout markers before binarizing,
            # so that Markovization can tell them apart,
            # e.g., VP|<NP_2-VVFIN> instead of VP|<NP-VVFIN>
),
# Miscellaneous.
verbosity=2,  # 0=silent; 1=summary report; 2=per sentence results;
        # 3=dump derivations/parse trees.
numproc=1,  # raise to use several CPUs; None means use all CPUs.