-
Notifications
You must be signed in to change notification settings - Fork 46
/
remove_sense_suffixes_from_lemmas.pl
executable file
·51 lines (49 loc) · 1.35 KB
/
remove_sense_suffixes_from_lemmas.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env perl
# Normalizes lemmas in UD_Latin-Perseus. (The same has been done to UD_Czech-PDT and others.)
# Copyright © 2018 Dan Zeman <zeman@ufal.mff.cuni.cz>
# License: GNU GPL
use strict;
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Main loop: read the input (files named on the command line, or STDIN) line
# by line. Lines that start with an integer ID followed by a tab are token
# lines and get their lemma normalized. Every other line (comments, blank
# lines, and IDs such as "1-2" or "1.1", which do not match the pattern) is
# printed exactly as it was read.
while(my $line = <>)
{
    if($line =~ m/^\d+\t/)
    {
        $line = normalize_token_line($line);
    }
    print($line);
}



#------------------------------------------------------------------------------
# Normalizes the lemma of one tab-separated token line.
#
# Parameter: the line, with or without its line terminator (LF or CRLF).
# Returns:   the resulting line, always terminated by a single "\n".
#
# Columns are addressed by position: [1] is the word form, [2] the lemma and
# [9] the MISC column, a '|'-separated list of attributes where '_' stands for
# an empty list. Two normalizations are applied, at most one per line:
#   1. If the form consists solely of punctuation characters but the lemma
#      contains a non-punctuation character, the lemma becomes the form.
#   2. Otherwise, if the form contains no digit and the lemma ends in a
#      numeric suffix, optionally separated by a hyphen (e.g. "sum1",
#      "status-1"), the suffix and the hyphen are removed.
# Whenever the lemma is changed, the original lemma is saved as LId=... in
# MISC, replacing any LId attribute that was already there.
#------------------------------------------------------------------------------
sub normalize_token_line
{
    my $line = shift;
    $line =~ s/\r?\n$//;
    # Limit -1 makes split() keep trailing empty columns, so the column count
    # of the input line is preserved.
    my @f = split(/\t/, $line, -1);
    # The code below reads and writes column [9]. Pad a truncated line with
    # the '_' placeholder so that we never work with undefined values and
    # never emit empty columns that perl would otherwise create implicitly.
    while(scalar(@f) < 10)
    {
        push(@f, '_');
    }
    my $form = $f[1];
    my $lemma = $f[2];
    my @misc = $f[9] eq '_' ? () : split(/\|/, $f[9]);
    my $new_lemma;
    # Lemma of punctuation symbols should be the symbols themselves, as in most other treebanks.
    if($form =~ m/^\pP+$/ && $lemma =~ m/\PP/)
    {
        $new_lemma = $form;
    }
    # Lemma should not contain a numerical suffix that disambiguates word senses.
    # Such disambiguation, if desired, should go to the LId attribute in MISC.
    # The prefix must be matched non-greedily: a greedy (.*\D) would swallow
    # the separating hyphen itself and "status-1" would become "status-".
    # The \D still guarantees that the remaining lemma is not empty.
    elsif($form !~ m/\d/ && $lemma =~ m/^(.*?\D)-?\d+$/)
    {
        $new_lemma = $1;
    }
    if(defined($new_lemma))
    {
        $f[2] = $new_lemma;
        # Keep at most one LId attribute: the one holding the original lemma.
        @misc = grep {!m/^LId=/} (@misc);
        push(@misc, "LId=$lemma");
    }
    $f[9] = scalar(@misc) >= 1 ? join('|', @misc) : '_';
    return join("\t", @f)."\n";
}