#! /usr/bin/env python3
# Original code (2015) by Filip Ginter and Sampo Pyysalo.
# DZ 2018-11-04: Porting the validator to Python 3.
# DZ: Many subsequent changes. See the git history.
import sys
import io
import os.path
import argparse
import traceback
# According to https://stackoverflow.com/questions/1832893/python-regex-matching-unicode-properties,
# the regex module has the same API as re but it can check Unicode character properties using \p{}
# as in Perl.
#import re
import regex as re
import unicodedata
import json
# The folder where this script resides.
THISDIR=os.path.dirname(os.path.realpath(os.path.abspath(__file__)))
# Constants for the column indices
COLCOUNT=10
ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC=range(COLCOUNT)
COLNAMES='ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC'.split(',')
TOKENSWSPACE=MISC+1 # one extra constant
AUX=MISC+2 # another extra constant
COP=MISC+3 # another extra constant
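# With these constants, a line parsed as cols = line.split("\t") is accessed
# as cols[ID], cols[FORM], ..., cols[MISC] throughout the validator. The three
# extra constants presumably index additional data sets (tokens with space,
# auxiliaries, copulas) in the same tag_sets structure.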
# Global variables:
curr_line = 0 # Current line in the input file
comment_start_line = 0 # The line in the input file on which the current sentence starts, including sentence-level comments.
sentence_line = 0 # The line in the input file on which the current sentence starts (the first node/token line, skipping comments)
sentence_id = None # The most recently read sentence id
line_of_first_morpho_feature = None # features are optional, but if the treebank has features, then some become required
delayed_feature_errors = {}
line_of_first_enhanced_graph = None
line_of_first_tree_without_enhanced_graph = None
line_of_first_enhancement = None # any difference between non-empty DEPS and HEAD:DEPREL
line_of_first_empty_node = None
line_of_first_enhanced_orphan = None
line_of_global_entity = None
global_entity_attribute_string = None # to be able to check that repeated declarations are identical
entity_attribute_number = 0 # to be able to check that an entity does not have extra attributes
entity_attribute_index = {} # key: entity attribute name; value: the index of the attribute in the entity attribute list
entity_types = {} # key: entity (cluster) id; value: tuple: (type of the entity, identity (Wikipedia etc.), line of the first mention)
open_entity_mentions = [] # items are dictionaries with entity mention information
open_discontinuous_mentions = {} # key: entity id; describes last part of a discontinuous mention of that entity; item is dict, its keys: last_ipart, npart, line
entity_ids_this_document = {}
entity_ids_other_documents = {}
entity_bridge_relations = {} # key: srceid<tgteid pair; value: type of the entity (may be empty)
entity_split_antecedents = {} # key: tgteid; value: sorted list of srceids, serialized to string
entity_mention_spans = {} # key: [eid][sentid][str(mention_span)]; value: set of node ids
error_counter = {} # key: error type value: error count
warn_on_missing_files = set() # langspec files which you should warn about in case they are missing (can be deprel, edeprel, feat_val, tokens_w_space)
warn_on_undoc_feats = '' # filled after reading docfeats.json; printed when an unknown feature is encountered in the data
warn_on_undoc_deps = '' # filled after reading docdeps.json; printed when an unknown relation is encountered in the data
warn_on_undoc_edeps = '' # filled after reading edeprels.json; printed when an unknown enhanced relation is encountered in the data
mwt_typo_span_end = None # if Typo=Yes at multiword token, what is the end of the multiword span?
spaceafterno_in_effect = False # needed to check that no space after last word of sentence does not co-occur with new paragraph or document
featdata = {} # key: language code (feature-value-UPOS data loaded from feats.json)
auxdata = {} # key: language code (auxiliary/copula data loaded from data.json)
depreldata = {} # key: language code (deprel data loaded from deprels.json)
edepreldata = {} # key: language code (edeprel data loaded from edeprels.json)
def warn(msg, testclass, testlevel, testid, lineno=0, nodeid=0, explanation=None):
"""
Print the error/warning message.
If lineno is 0, print the number of the current line (most recently read from input).
If lineno is < 0, print the number of the first line of the current sentence.
If lineno is > 0, print lineno (probably pointing somewhere in the current sentence).
If explanation contains a string and this is the first time we are reporting
an error of this type, the string will be appended to the main message. It
can be used as an extended explanation of the situation.
"""
global curr_fname, curr_line, sentence_line, sentence_id, error_counter, args
error_counter[testclass] = error_counter.get(testclass, 0)+1
if args.max_err > 0 and error_counter[testclass] > args.max_err:
if error_counter[testclass] == args.max_err + 1:
print(('...suppressing further errors regarding ' + testclass), file=sys.stderr)
        pass # suppressed
elif not args.quiet:
if explanation and error_counter[testclass] == 1:
msg += ' ' + explanation
if len(args.input) > 1: # several files, should report which one
if curr_fname=='-':
fn = '(in STDIN) '
else:
fn = '(in '+os.path.basename(curr_fname)+') '
else:
fn = ''
sent = ''
node = ''
# Global variable (last read sentence id): sentence_id
# Originally we used a parameter sid but we probably do not need to override the global value.
if sentence_id:
sent = ' Sent ' + sentence_id
if nodeid:
node = ' Node ' + str(nodeid)
if lineno > 0:
print(f"[{fn}Line {lineno}{sent}{node}]: [L{testlevel} {testclass} {testid}] {msg}",
file=sys.stderr)
elif lineno < 0:
print(f"[{fn}Line {sentence_line}{sent}{node}]: [L{testlevel} {testclass} {testid}] {msg}",
file=sys.stderr)
else:
print(f"[{fn}Line {curr_line}{sent}{node}]: [L{testlevel} {testclass} {testid}] {msg}",
file=sys.stderr)
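# A hedged usage sketch of warn() (illustrative values, not executed here):
#     warn('Bad value.', 'Format', 2, 'some-testid')             # reports curr_line
#     warn('Bad value.', 'Format', 2, 'some-testid', lineno=-1)  # reports sentence_line
#     warn('Bad value.', 'Format', 2, 'some-testid', lineno=42)  # reports line 42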
###### Support functions
ws_re = re.compile(r"^\s+$")
def is_whitespace(line):
return ws_re.match(line)
word_re = re.compile(r"^[1-9][0-9]*$")
def is_word(cols):
return word_re.match(cols[ID])
mwt_re = re.compile(r"^[1-9][0-9]*-[1-9][0-9]*$")
def is_multiword_token(cols):
return mwt_re.match(cols[ID])
empty_node_re = re.compile(r"^[0-9]+\.[1-9][0-9]*$")
def is_empty_node(cols):
return empty_node_re.match(cols[ID])
def parse_empty_node_id(cols):
m = re.match(r"^([0-9]+)\.([0-9]+)$", cols[ID])
assert m, 'parse_empty_node_id with non-empty node'
return m.groups()
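# Illustrative ID formats from the first CoNLL-U column: '5' is a regular word,
# '5-6' is a multiword token, '5.1' is an empty node; parse_empty_node_id()
# returns the two parts, e.g. ('5', '1').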
def shorten(string):
return string if len(string) < 25 else string[:20]+'[...]'
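# Strips the language-specific subtype from a relation label,
# e.g. lspec2ud('acl:relcl') == 'acl'; a bare 'acl' is returned unchanged.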
def lspec2ud(deprel):
return deprel.split(':', 1)[0]
#==============================================================================
# Level 1 tests. Only CoNLL-U backbone. Values can be empty or non-UD.
#==============================================================================
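# The sent_id comment line looks like '# sent_id = train-s45'; the regex
# captures the id itself ('train-s45').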
sentid_re=re.compile(r"^# sent_id\s*=\s*(\S+)$")
def trees(inp, tag_sets, args):
"""
`inp` a file-like object yielding lines as unicode
`tag_sets` and `args` are needed for choosing the tests
This function does elementary checking of the input and yields one
sentence at a time from the input stream.
This function is a generator. The caller can call it in a 'for x in ...'
loop. In each iteration of the caller's loop, the generator will generate
the next sentence, that is, it will read the next sentence from the input
stream. (Technically, the function returns an object, and the object will
then read the sentences within the caller's loop.)
"""
global curr_line, comment_start_line, sentence_line, sentence_id
comments = [] # List of comment lines to go with the current sentence
lines = [] # List of token/word lines of the current sentence
corrupted = False # In case of wrong number of columns check the remaining lines of the sentence but do not yield the sentence for further processing.
comment_start_line = None
testlevel = 1
testclass = 'Format'
for line_counter, line in enumerate(inp):
curr_line = line_counter+1
if not comment_start_line:
comment_start_line = curr_line
line = line.rstrip("\n")
if is_whitespace(line):
testid = 'pseudo-empty-line'
testmessage = 'Spurious line that appears empty but is not; there are whitespace characters.'
warn(testmessage, testclass, testlevel, testid)
# We will pretend that the line terminates a sentence in order to
# avoid subsequent misleading error messages.
if lines:
if not corrupted:
yield comments, lines
comments = []
lines = []
corrupted = False
comment_start_line = None
elif not line: # empty line
if lines: # sentence done
if not corrupted:
yield comments, lines
comments=[]
lines=[]
corrupted = False
comment_start_line = None
else:
testid = 'extra-empty-line'
testmessage = 'Spurious empty line. Only one empty line is expected after every sentence.'
warn(testmessage, testclass, testlevel, testid)
elif line[0]=='#':
# We will really validate sentence ids later. But now we want to remember
# everything that looks like a sentence id and use it in the error messages.
# Line numbers themselves may not be sufficient if we are reading multiple
# files from a pipe.
match = sentid_re.match(line)
if match:
sentence_id = match.group(1)
if not lines: # before sentence
comments.append(line)
else:
testid = 'misplaced-comment'
testmessage = 'Spurious comment line. Comments are only allowed before a sentence.'
warn(testmessage, testclass, testlevel, testid)
elif line[0].isdigit():
validate_unicode_normalization(line)
if not lines: # new sentence
sentence_line=curr_line
cols=line.split("\t")
if len(cols)!=COLCOUNT:
testid = 'number-of-columns'
testmessage = f'The line has {len(cols)} columns but {COLCOUNT} are expected. The contents of the columns will not be checked.'
warn(testmessage, testclass, testlevel, testid)
corrupted = True
# If there is an unexpected number of columns, do not test their contents.
                # Maybe the contents belong to a different column, and we could
                # get an exception if a column value is missing.
else:
lines.append(cols)
validate_cols_level1(cols)
if args.level > 1:
validate_cols(cols, tag_sets, args)
else: # A line which is neither a comment nor a token/word, nor empty. That's bad!
testid = 'invalid-line'
testmessage = f"Spurious line: '{line}'. All non-empty lines should start with a digit or the # character."
warn(testmessage, testclass, testlevel, testid)
else: # end of file
if comments or lines: # These should have been yielded on an empty line!
testid = 'missing-empty-line'
testmessage = 'Missing empty line after the last sentence.'
warn(testmessage, testclass, testlevel, testid)
if not corrupted:
yield comments, lines
###### Tests applicable to a single row independently of the others
def validate_unicode_normalization(text):
"""
Tests that letters composed of multiple Unicode characters (such as a base
letter plus combining diacritics) conform to NFC normalization (canonical
decomposition followed by canonical composition).
"""
normalized_text = unicodedata.normalize('NFC', text)
if text != normalized_text:
# Find the first unmatched character and include it in the report.
firsti = -1
firstj = -1
inpfirst = ''
nfcfirst = ''
tcols = text.split("\t")
ncols = normalized_text.split("\t")
for i in range(len(tcols)):
for j in range(len(tcols[i])):
if tcols[i][j] != ncols[i][j]:
firsti = i
firstj = j
inpfirst = unicodedata.name(tcols[i][j])
nfcfirst = unicodedata.name(ncols[i][j])
break
if firsti >= 0:
break
testlevel = 1
testclass = 'Unicode'
testid = 'unicode-normalization'
testmessage = f"Unicode not normalized: {COLNAMES[firsti]}.character[{firstj}] is {inpfirst}, should be {nfcfirst}."
warn(testmessage, testclass, testlevel, testid)
whitespace_re = re.compile(r".*\s", re.U)
whitespace2_re = re.compile(r".*\s\s", re.U)
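# whitespace_re matches any value containing whitespace anywhere (e.g. 'New York');
# whitespace2_re matches only values with two adjacent whitespace characters.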
def validate_cols_level1(cols):
"""
Tests that can run on a single line and pertain only to the CoNLL-U file
format, not to predefined sets of UD tags.
"""
testlevel = 1
testclass = 'Format'
# Some whitespace may be permitted in FORM, LEMMA and MISC but not elsewhere.
for col_idx in range(MISC+1):
if col_idx >= len(cols):
break # this has been already reported in trees()
# Must never be empty
if not cols[col_idx]:
testid = 'empty-column'
testmessage = f'Empty value in column {COLNAMES[col_idx]}.'
warn(testmessage, testclass, testlevel, testid)
else:
# Must never have leading/trailing whitespace
if cols[col_idx][0].isspace():
testid = 'leading-whitespace'
testmessage = f'Leading whitespace not allowed in column {COLNAMES[col_idx]}.'
warn(testmessage, testclass, testlevel, testid)
if cols[col_idx][-1].isspace():
testid = 'trailing-whitespace'
testmessage = f'Trailing whitespace not allowed in column {COLNAMES[col_idx]}.'
warn(testmessage, testclass, testlevel, testid)
# Must never contain two consecutive whitespace characters
if whitespace2_re.match(cols[col_idx]):
testid = 'repeated-whitespace'
testmessage = f'Two or more consecutive whitespace characters not allowed in column {COLNAMES[col_idx]}.'
warn(testmessage, testclass, testlevel, testid)
# Multi-word tokens may have whitespaces in MISC but not in FORM or LEMMA.
# If it contains a space, it does not make sense to treat it as a MWT.
if is_multiword_token(cols):
for col_idx in (FORM, LEMMA):
if col_idx >= len(cols):
break # this has been already reported in trees()
if whitespace_re.match(cols[col_idx]):
testid = 'invalid-whitespace-mwt'
testmessage = f"White space not allowed in multi-word token '{cols[col_idx]}'. If it contains a space, it is not one surface token."
warn(testmessage, testclass, testlevel, testid)
# These columns must not have whitespace.
for col_idx in (ID, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS):
if col_idx >= len(cols):
break # this has been already reported in trees()
if whitespace_re.match(cols[col_idx]):
testid = 'invalid-whitespace'
testmessage = f"White space not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
warn(testmessage, testclass, testlevel, testid)
# Check for the format of the ID value. (ID must not be empty.)
if not (is_word(cols) or is_empty_node(cols) or is_multiword_token(cols)):
testid = 'invalid-word-id'
testmessage = f"Unexpected ID format '{cols[ID]}'."
warn(testmessage, testclass, testlevel, testid)
##### Tests applicable to the whole tree
interval_re = re.compile(r"^([0-9]+)-([0-9]+)$", re.U)
def validate_ID_sequence(tree):
"""
Validates that the ID sequence is correctly formed.
Besides issuing a warning if an error is found, it also returns False to
the caller so it can avoid building a tree from corrupt ids.
"""
ok = True
testlevel = 1
testclass = 'Format'
words=[]
tokens=[]
current_word_id, next_empty_id = 0, 1
for cols in tree:
if not is_empty_node(cols):
next_empty_id = 1 # reset sequence
if is_word(cols):
t_id = int(cols[ID])
current_word_id = t_id
words.append(t_id)
# Not covered by the previous interval?
if not (tokens and tokens[-1][0] <= t_id and tokens[-1][1] >= t_id):
tokens.append((t_id, t_id)) # nope - let's make a default interval for it
elif is_multiword_token(cols):
match = interval_re.match(cols[ID]) # Check the interval against the regex
if not match: # This should not happen. The function is_multiword_token() would then not return True.
testid = 'invalid-word-interval'
testmessage = f"Spurious word interval definition: '{cols[ID]}'."
warn(testmessage, testclass, testlevel, testid)
ok = False
continue
beg, end = int(match.group(1)), int(match.group(2))
if not ((not words and beg >= 1) or (words and beg >= words[-1] + 1)):
testid = 'misplaced-word-interval'
testmessage = 'Multiword range not before its first word.'
warn(testmessage, testclass, testlevel, testid)
ok = False
continue
tokens.append((beg, end))
elif is_empty_node(cols):
word_id, empty_id = (int(i) for i in parse_empty_node_id(cols))
if word_id != current_word_id or empty_id != next_empty_id:
testid = 'misplaced-empty-node'
testmessage = f'Empty node id {cols[ID]}, expected {current_word_id}.{next_empty_id}'
warn(testmessage, testclass, testlevel, testid)
ok = False
next_empty_id += 1
# Interaction of multiword tokens and empty nodes if there is an empty
# node between the first word of a multiword token and the previous word:
# This sequence is correct: 4 4.1 5-6 5 6
# This sequence is wrong: 4 5-6 4.1 5 6
if word_id == current_word_id and tokens and word_id < tokens[-1][0]:
testid = 'misplaced-empty-node'
testmessage = f"Empty node id {cols[ID]} must occur before multiword token {tokens[-1][0]}-{tokens[-1][1]}."
warn(testmessage, testclass, testlevel, testid)
ok = False
# Now let's do some basic sanity checks on the sequences.
# Expected sequence of word IDs is 1, 2, ...
expstrseq = ','.join(str(x) for x in range(1, len(words) + 1))
wrdstrseq = ','.join(str(x) for x in words)
if wrdstrseq != expstrseq:
testid = 'word-id-sequence'
testmessage = f"Words do not form a sequence. Got '{wrdstrseq}'. Expected '{expstrseq}'."
warn(testmessage, testclass, testlevel, testid, lineno=-1)
ok = False
# Check elementary sanity of word intervals.
# Remember that these are not just multi-word tokens. Here we have intervals even for single-word tokens (b=e)!
for (b, e) in tokens:
if e<b: # end before beginning
testid = 'reversed-word-interval'
testmessage = f'Spurious token interval {b}-{e}'
warn(testmessage, testclass, testlevel, testid)
ok = False
continue
if b<1 or e>len(words): # out of range
testid = 'word-interval-out'
testmessage = f'Spurious token interval {b}-{e} (out of range)'
warn(testmessage, testclass, testlevel, testid)
ok = False
continue
return ok
def validate_token_ranges(tree):
"""
Checks that the word ranges for multiword tokens are valid.
"""
testlevel = 1
testclass = 'Format'
covered = set()
for cols in tree:
if not is_multiword_token(cols):
continue
m = interval_re.match(cols[ID])
if not m: # This should not happen. The function is_multiword_token() would then not return True.
testid = 'invalid-word-interval'
testmessage = f"Spurious word interval definition: '{cols[ID]}'."
warn(testmessage, testclass, testlevel, testid)
continue
start, end = m.groups()
try:
start, end = int(start), int(end)
except ValueError:
            assert False, 'internal error' # the regex above ensures that this works
if start >= end: ###!!! This was already tested above in validate_ID_sequence()! Should we remove it from there?
testid = 'reversed-word-interval'
testmessage = f'Spurious token interval {start}-{end}'
warn(testmessage, testclass, testlevel, testid)
continue
if covered & set(range(start, end+1)):
testid = 'overlapping-word-intervals'
testmessage = f'Range overlaps with others: {cols[ID]}'
warn(testmessage, testclass, testlevel, testid)
covered |= set(range(start, end+1))
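# Example of the overlap test above: the ranges '1-2' and '2-3' both cover
# word 2, which triggers 'overlapping-word-intervals'.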
def validate_newlines(inp):
if inp.newlines and inp.newlines != '\n':
testlevel = 1
testclass = 'Format'
testid = 'non-unix-newline'
testmessage = 'Only the unix-style LF line terminator is allowed.'
warn(testmessage, testclass, testlevel, testid)
#==============================================================================
# Level 2 tests. Tree structure, universal tags and deprels. Note that any
# well-formed Feature=Value pair is allowed (because it could be language-
# specific) and any word form or lemma can contain spaces (because language-
# specific guidelines may permit it).
#==============================================================================
###### Metadata tests #########
def validate_sent_id(comments, known_ids, lcode):
testlevel = 2
testclass = 'Metadata'
matched=[]
for c in comments:
match=sentid_re.match(c)
if match:
matched.append(match)
else:
if c.startswith('# sent_id') or c.startswith('#sent_id'):
testid = 'invalid-sent-id'
testmessage = f"Spurious sent_id line: '{c}' should look like '# sent_id = xxxxx' where xxxxx is not whitespace. Forward slash reserved for special purposes."
warn(testmessage, testclass, testlevel, testid)
if not matched:
testid = 'missing-sent-id'
testmessage = 'Missing the sent_id attribute.'
warn(testmessage, testclass, testlevel, testid)
elif len(matched)>1:
testid = 'multiple-sent-id'
testmessage = 'Multiple sent_id attributes.'
warn(testmessage, testclass, testlevel, testid)
else:
# Uniqueness of sentence ids should be tested treebank-wide, not just file-wide.
# For that to happen, all three files should be tested at once.
sid=matched[0].group(1)
if sid in known_ids:
testid = 'non-unique-sent-id'
testmessage = f"Non-unique sent_id attribute '{sid}'."
warn(testmessage, testclass, testlevel, testid)
if sid.count("/")>1 or (sid.count("/")==1 and lcode!="ud" and lcode!="shopen"):
testid = 'slash-in-sent-id'
testmessage = f"The forward slash is reserved for special use in parallel treebanks: '{sid}'"
warn(testmessage, testclass, testlevel, testid)
known_ids.add(sid)
newdoc_re = re.compile(r"^#\s*newdoc(\s|$)")
newpar_re = re.compile(r"^#\s*newpar(\s|$)")
text_re = re.compile(r"^#\s*text\s*=\s*(.+)$")
def validate_text_meta(comments, tree, args):
# Remember if SpaceAfter=No applies to the last word of the sentence.
# This is not prohibited in general but it is prohibited at the end of a paragraph or document.
global spaceafterno_in_effect
# In trees(), sentence_line was already moved to the first token/node line
# after the sentence comment lines. While this is useful in most validation
# functions, it complicates things here where we also work with the comments.
# warn(lineno=-1) will print the sentence_line, i.e., after the comments.
# warn() without lineno will refer to the empty line after the sentence.
global sentence_line
testlevel = 2
testclass = 'Metadata'
newdoc_matched = []
newpar_matched = []
text_matched = []
for c in comments:
newdoc_match = newdoc_re.match(c)
if newdoc_match:
newdoc_matched.append(newdoc_match)
newpar_match = newpar_re.match(c)
if newpar_match:
newpar_matched.append(newpar_match)
text_match = text_re.match(c)
if text_match:
text_matched.append(text_match)
if len(newdoc_matched) > 1:
testid = 'multiple-newdoc'
testmessage = 'Multiple newdoc attributes.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
if len(newpar_matched) > 1:
testid = 'multiple-newpar'
testmessage = 'Multiple newpar attributes.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
if (newdoc_matched or newpar_matched) and spaceafterno_in_effect:
testid = 'spaceafter-newdocpar'
testmessage = 'New document or paragraph starts when the last token of the previous sentence says SpaceAfter=No.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
if not text_matched:
testid = 'missing-text'
testmessage = 'Missing the text attribute.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
elif len(text_matched) > 1:
testid = 'multiple-text'
testmessage = 'Multiple text attributes.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
else:
stext = text_matched[0].group(1)
if stext[-1].isspace():
testid = 'text-trailing-whitespace'
testmessage = 'The text attribute must not end with whitespace.'
warn(testmessage, testclass, testlevel, testid, lineno=-1)
# Validate the text against the SpaceAfter attribute in MISC.
skip_words = set()
mismatch_reported = 0 # do not report multiple mismatches in the same sentence; they usually have the same cause
# We will sum sentence_line + iline, and sentence_line already points at
        # the first token/node line after the sentence comments. Hence iline must
        # be 0 on the first iteration of the loop.
iline = -1
for cols in tree:
iline += 1
if MISC >= len(cols):
# This error has been reported elsewhere but we cannot check MISC now.
continue
if 'NoSpaceAfter=Yes' in cols[MISC]: # I leave this without the split("|") to catch all
testid = 'nospaceafter-yes'
testmessage = "'NoSpaceAfter=Yes' should be replaced with 'SpaceAfter=No'."
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
if len([x for x in cols[MISC].split('|') if re.match(r"^SpaceAfter=", x) and x != 'SpaceAfter=No']) > 0:
testid = 'spaceafter-value'
testmessage = "Unexpected value of the 'SpaceAfter' attribute in MISC. Did you mean 'SpacesAfter'?"
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
if '.' in cols[ID]: # empty node
if 'SpaceAfter=No' in cols[MISC]: # I leave this without the split("|") to catch all
testid = 'spaceafter-empty-node'
testmessage = "'SpaceAfter=No' cannot occur with empty nodes."
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
continue
elif '-' in cols[ID]: # multi-word token
beg,end=cols[ID].split('-')
try:
begi,endi = int(beg),int(end)
except ValueError as e:
# This error has been reported elsewhere.
begi,endi = 1,0
# If we see a multi-word token, add its words to an ignore-set - these will be skipped, and also checked for absence of SpaceAfter=No
for i in range(begi, endi+1):
skip_words.add(str(i))
elif cols[ID] in skip_words:
if 'SpaceAfter=No' in cols[MISC]:
testid = 'spaceafter-mwt-node'
testmessage = "'SpaceAfter=No' cannot occur with words that are part of a multi-word token."
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
continue
else:
# Err, I guess we have nothing to do here. :)
pass
# So now we have either a multi-word token or a word which is also a token in its entirety.
if not stext.startswith(cols[FORM]):
if not mismatch_reported:
testid = 'text-form-mismatch'
testmessage = f"Mismatch between the text attribute and the FORM field. Form[{cols[ID]}] is '{cols[FORM]}' but text is '{stext[:len(cols[FORM])+20]}...'"
if len(stext) >= 1 and stext[0].isspace():
testmessage += " (perhaps extra SpaceAfter=No at previous token?)"
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
mismatch_reported = 1
else:
stext = stext[len(cols[FORM]):] # eat the form
if 'SpaceAfter=No' in cols[MISC].split("|"):
spaceafterno_in_effect = True
else:
spaceafterno_in_effect = False
if args.check_space_after and (stext) and not stext[0].isspace():
testid = 'missing-spaceafter'
testmessage = f"'SpaceAfter=No' is missing in the MISC field of node #%{cols[ID]} because the text is '{shorten(cols[FORM]+stext)}'."
warn(testmessage, testclass, testlevel, testid, lineno=sentence_line+iline)
stext = stext.lstrip()
if stext:
testid = 'text-extra-chars'
testmessage = f"Extra characters at the end of the text attribute, not accounted for in the FORM fields: '{stext}'"
warn(testmessage, testclass, testlevel, testid)
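# Illustration of the alignment checked above: given '# text = Hello, world!',
# the FORMs 'Hello', ',', 'world', '!' must concatenate back to the text, with
# SpaceAfter=No in MISC wherever no space follows ('Hello' and 'world').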
##### Tests applicable to a single row independently of the others
def validate_cols(cols, tag_sets, args):
"""
All tests that can run on a single line. Done as soon as the line is read,
called from trees() if level>1.
"""
if is_word(cols) or is_empty_node(cols):
validate_character_constraints(cols) # level 2
validate_upos(cols, tag_sets) # level 2
validate_features(cols, tag_sets, args) # level 2 and up (relevant code checks whether higher level is required)
elif is_multiword_token(cols):
validate_token_empty_vals(cols)
# else do nothing; we have already reported wrong ID format at level 1
if is_word(cols):
validate_deprels(cols, tag_sets, args) # level 2 and up
elif is_empty_node(cols):
validate_empty_node_empty_vals(cols) # level 2
if args.level > 3:
validate_whitespace(cols, tag_sets) # level 4 (it is language-specific; to disallow everywhere, use --lang ud)
def validate_token_empty_vals(cols):
"""
Checks that a multi-word token has _ empty values in all fields except MISC.
This is required by UD guidelines although it is not a problem in general,
therefore a level 2 test.
"""
global mwt_typo_span_end
assert is_multiword_token(cols), 'internal error'
for col_idx in range(LEMMA, MISC): # all columns except the first two (ID, FORM) and the last one (MISC)
# Exception: The feature Typo=Yes may occur in FEATS of a multi-word token.
if col_idx == FEATS and cols[col_idx] == 'Typo=Yes':
# If a multi-word token has Typo=Yes, its component words must not have it.
# We must remember the span of the MWT and check it in validate_features().
m = interval_re.match(cols[ID])
mwt_typo_span_end = m.group(2)
elif cols[col_idx] != '_':
testlevel = 2
testclass = 'Format'
testid = 'mwt-nonempty-field'
testmessage = f"A multi-word token line must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'."
warn(testmessage, testclass, testlevel, testid)
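# Example of the Typo=Yes exception above: a multi-word token line with
# ID '5-6' and FEATS 'Typo=Yes' sets mwt_typo_span_end = '6';
# validate_features() then flags Typo=Yes if it is repeated on word 5 or 6.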
def validate_empty_node_empty_vals(cols):
"""
Checks that an empty node has _ empty values in HEAD and DEPREL. This is
required by UD guidelines but not necessarily by CoNLL-U, therefore
a level 2 test.
"""
assert is_empty_node(cols), 'internal error'
for col_idx in (HEAD, DEPREL):
        if cols[col_idx] != '_':
            testlevel = 2
            testclass = 'Format'
            testid = 'empty-node-nonempty-field'
testmessage = f"An empty node must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'."
warn(testmessage, testclass, testlevel, testid)
# Ll ... lowercase Unicode letters
# Lm ... modifier Unicode letters (e.g., superscript h)
# Lo ... other Unicode letters (all caseless scripts, e.g., Arabic)
# M .... combining diacritical marks
# Underscore is allowed between letters but not at beginning, end, or next to another underscore.
edeprelpart_resrc = r'[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*'
# There must be always the universal part, consisting only of ASCII letters.
# There can be up to three additional, colon-separated parts: subtype, preposition and case.
# One of them, the preposition, may contain Unicode letters. We do not know which one it is
# (only if there are all four parts, we know it is the third one).
# ^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$
edeprel_resrc = '^[a-z]+(:[a-z]+)?(:' + edeprelpart_resrc + ')?(:[a-z]+)?$'
edeprel_re = re.compile(edeprel_resrc)
deprel_re = re.compile(r"^[a-z]+(:[a-z]+)?$")
upos_re = re.compile(r"^[A-Z]+$")
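# Illustrative labels: deprel_re accepts 'nsubj' and one subtype as in
# 'acl:relcl'; edeprel_re additionally accepts case-marker parts such as
# 'obl:arg:de:gen' or 'nmod:podle_vzoru:gen' (the underscore joins a multiword
# case marker); upos_re accepts all-uppercase tags such as 'NOUN'.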
def validate_character_constraints(cols):
"""
Checks general constraints on valid characters, e.g. that UPOS
only contains [A-Z].
"""
testlevel = 2
if is_multiword_token(cols):
return
if UPOS >= len(cols):
return # this has been already reported in trees()
if not (upos_re.match(cols[UPOS]) or (is_empty_node(cols) and cols[UPOS] == '_')):
testclass = 'Morpho'
testid = 'invalid-upos'
testmessage = f"Invalid UPOS value '{cols[UPOS]}'."
warn(testmessage, testclass, testlevel, testid)
if not (deprel_re.match(cols[DEPREL]) or (is_empty_node(cols) and cols[DEPREL] == '_')):
testclass = 'Syntax'
testid = 'invalid-deprel'
testmessage = f"Invalid DEPREL value '{cols[DEPREL]}'."
warn(testmessage, testclass, testlevel, testid)
try:
deps_list(cols)
except ValueError:
testclass = 'Enhanced'
testid = 'invalid-deps'
testmessage = f"Failed to parse DEPS: '{cols[DEPS]}'."
warn(testmessage, testclass, testlevel, testid)
return
if any(deprel for head, deprel in deps_list(cols)
if not edeprel_re.match(deprel)):
testclass = 'Enhanced'
testid = 'invalid-edeprel'
testmessage = f"Invalid enhanced relation type: '{cols[DEPS]}'."
warn(testmessage, testclass, testlevel, testid)
attr_val_re=re.compile(r"^([A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?)=(([A-Z0-9][A-Z0-9a-z]*)(,([A-Z0-9][A-Z0-9a-z]*))*)$")
val_re=re.compile(r"^[A-Z0-9][A-Za-z0-9]*")
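# Illustrative FEATS entries accepted by attr_val_re: 'Case=Nom',
# a layered feature such as 'Number[psor]=Sing', and a multivalued feature
# such as 'PronType=Int,Rel'.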
def validate_features(cols, tag_sets, args):
"""
Checks general constraints on feature-value format. On level 4 and higher,
also checks that a feature-value pair is listed as approved. (Every pair
must be allowed on level 2 because it could be defined as language-specific.
To disallow non-universal features, test on level 4 with language 'ud'.)
"""
global warn_on_undoc_feats
global mwt_typo_span_end
testclass = 'Morpho'
if FEATS >= len(cols):
return # this has been already reported in trees()
feats = cols[FEATS]
if feats == '_':
return True
features_present()
    # List of permitted features is language-specific.
# The current token may be in a different language due to code switching.
lang = args.lang
featset = tag_sets[FEATS]
altlang = get_alt_language(cols[MISC])
if altlang:
lang = altlang
featset = get_featdata_for_language(altlang)
feat_list=feats.split('|')
if [f.lower() for f in feat_list] != sorted(f.lower() for f in feat_list):
testlevel = 2
testid = 'unsorted-features'
testmessage = f"Morphological features must be sorted: '{feats}'."
warn(testmessage, testclass, testlevel, testid)
attr_set = set() # I'll gather the set of features here to check later that none is repeated.
for f in feat_list:
match = attr_val_re.match(f)
if match is None:
testlevel = 2
testid = 'invalid-feature'
testmessage = f"Spurious morphological feature: '{f}'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9]."
warn(testmessage, testclass, testlevel, testid)
attr_set.add(f) # to prevent misleading error "Repeated features are disallowed"
else:
# Check that the values are sorted as well
attr = match.group(1)
attr_set.add(attr)
values = match.group(2).split(',')
if len(values) != len(set(values)):
testlevel = 2
testid = 'repeated-feature-value'
testmessage = f"Repeated feature values are disallowed: '{feats}'"
warn(testmessage, testclass, testlevel, testid)
if [v.lower() for v in values] != sorted(v.lower() for v in values):
testlevel = 2
testid = 'unsorted-feature-values'
testmessage = f"If a feature has multiple values, these must be sorted: '{f}'"
warn(testmessage, testclass, testlevel, testid)
for v in values:
if not val_re.match(v):
testlevel = 2
testid = 'invalid-feature-value'
testmessage = f"Spurious value '{v}' in '{f}'. Must start with [A-Z0-9] and only contain [A-Za-z0-9]."
warn(testmessage, testclass, testlevel, testid)
# Level 2 tests character properties and canonical order but not that the f-v pair is known.
# Level 4 also checks whether the feature value is on the list.
# If only universal feature-value pairs are allowed, test on level 4 with lang='ud'.
if args.level > 3:
testlevel = 4
# The feature Typo=Yes is the only feature allowed on a multi-word token line.
# If it occurs there, it cannot be duplicated on the lines of the component words.
                    if attr == 'Typo' and mwt_typo_span_end and int(cols[ID]) <= int(mwt_typo_span_end):
testid = 'mwt-typo-repeated-at-word'
testmessage = "Feature Typo cannot occur at a word if it already occurred at the corresponding multi-word token."
warn(testmessage, testclass, testlevel, testid)
# In case of code switching, the current token may not be in the default language
# and then its features are checked against a different feature set. An exception
# is the feature Foreign, which always relates to the default language of the
# corpus (but Foreign=Yes should probably be allowed for all UPOS categories in
# all languages).
effective_featset = featset
effective_lang = lang
if attr == 'Foreign':
# Revert to the default.
effective_featset = tag_sets[FEATS]
effective_lang = args.lang
if effective_featset is not None:
if attr not in effective_featset:
testid = 'feature-unknown'
testmessage = f"Feature {attr} is not documented for language [{effective_lang}]."
if not altlang and len(warn_on_undoc_feats) > 0:
# If some features were excluded because they are not documented,
# tell the user when the first unknown feature is encountered in the data.
# Then erase this (long) introductory message and do not repeat it with
# other instances of unknown features.
testmessage += "\n\n" + warn_on_undoc_feats
warn_on_undoc_feats = ''
warn(testmessage, testclass, testlevel, testid)
else:
lfrecord = effective_featset[attr]
if lfrecord['permitted'] == 0:
testid = 'feature-not-permitted'
testmessage = f"Feature {attr} is not permitted in language [{effective_lang}]."
if not altlang and len(warn_on_undoc_feats) > 0:
testmessage += "\n\n" + warn_on_undoc_feats
warn_on_undoc_feats = ''
warn(testmessage, testclass, testlevel, testid)
else:
values = lfrecord['uvalues'] + lfrecord['lvalues'] + lfrecord['unused_uvalues'] + lfrecord['unused_lvalues']
                                if v not in values:
testid = 'feature-value-unknown'
testmessage = f"Value {v} is not documented for feature {attr} in language [{effective_lang}]."
if not altlang and len(warn_on_undoc_feats) > 0:
testmessage += "\n\n" + warn_on_undoc_feats
warn_on_undoc_feats = ''
warn(testmessage, testclass, testlevel, testid)
                                elif cols[UPOS] not in lfrecord['byupos']:
testid = 'feature-upos-not-permitted'
testmessage = f"Feature {attr} is not permitted with UPOS {cols[UPOS]} in language [{effective_lang}]."
if not altlang and len(warn_on_undoc_feats) > 0:
testmessage += "\n\n" + warn_on_undoc_feats
warn_on_undoc_feats = ''
warn(testmessage, testclass, testlevel, testid)
                                elif v not in lfrecord['byupos'][cols[UPOS]] or lfrecord['byupos'][cols[UPOS]][v] == 0:
testid = 'feature-value-upos-not-permitted'
testmessage = f"Value {v} of feature {attr} is not permitted with UPOS {cols[UPOS]} in language [{effective_lang}]."
if not altlang and len(warn_on_undoc_feats) > 0:
testmessage += "\n\n" + warn_on_undoc_feats
warn_on_undoc_feats = ''
warn(testmessage, testclass, testlevel, testid)
if len(attr_set) != len(feat_list):
testlevel = 2
testid = 'repeated-feature'
testmessage = f"Repeated features are disallowed: '{feats}'."
warn(testmessage, testclass, testlevel, testid)
if mwt_typo_span_end and int(mwt_typo_span_end) <= int(cols[ID]):
mwt_typo_span_end = None
def features_present():
"""
In general, the annotation of morphological features is optional, although
highly encouraged. However, if the treebank does have features, then certain
features become required. This function is called when the first morphological
feature is encountered. It remembers that from now on, missing features can
be reported as errors. In addition, if any such errors have already been
encountered, they will be reported now.
"""
global curr_line
global line_of_first_morpho_feature
global delayed_feature_errors
if not line_of_first_morpho_feature:
line_of_first_morpho_feature = curr_line
for testid in delayed_feature_errors:
for occurrence in delayed_feature_errors[testid]['occurrences']:
warn(delayed_feature_errors[testid]['message'],
delayed_feature_errors[testid]['class'],
delayed_feature_errors[testid]['level'],
testid, nodeid=occurrence['nodeid'],
lineno=occurrence['lineno'])
def validate_required_feature(feats, fv, testmessage, testlevel, testid, nodeid, lineno):
"""
In general, the annotation of morphological features is optional, although
highly encouraged. However, if the treebank does have features, then certain
features become required. This function will check the presence of a feature
and if it is missing, an error will be reported only if at least one feature
has been already encountered. Otherwise the error will be remembered and it
may be reported afterwards if any feature is encountered later.
"""
global line_of_first_morpho_feature
global delayed_feature_errors
testclass = 'Morpho'
###!!! We may want to check that any value of a given feature is present,
###!!! or even that a particular value is present. Currently we only test
###!!! Typo=Yes, i.e., the latter case.
    if fv not in feats.split('|'):
if line_of_first_morpho_feature:
warn(testmessage, testclass, testlevel, testid, nodeid=nodeid, lineno=lineno)
else:
            if testid not in delayed_feature_errors:
delayed_feature_errors[testid] = {'class': testclass, 'level': testlevel,
'message': testmessage, 'occurrences': []}
delayed_feature_errors[testid]['occurrences'].append({'nodeid': nodeid,
'lineno': lineno})
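# A hedged usage sketch (arguments illustrative): requiring Typo=Yes on a node
# would look like
#     validate_required_feature(cols[FEATS], 'Typo=Yes',
#         'Typo=Yes is expected here.', 2, 'some-testid',
#         cols[ID], curr_line)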
def validate_upos(cols, tag_sets):
if UPOS >= len(cols):
return # this has been already reported in trees()
if is_empty_node(cols) and cols[UPOS] == '_':
return
if tag_sets[UPOS] is not None and cols[UPOS] not in tag_sets[UPOS]:
testlevel = 2
testclass = 'Morpho'
testid = 'unknown-upos'
testmessage = f"Unknown UPOS tag: '{cols[UPOS]}'."
warn(testmessage, testclass, testlevel, testid)
def validate_deprels(cols, tag_sets, args):
global warn_on_undoc_deps
global warn_on_undoc_edeps
if DEPREL >= len(cols):
return # this has been already reported in trees()
    # List of permitted relations is language-specific.
# The current token may be in a different language due to code switching.
deprelset = tag_sets[DEPREL]
###!!! Unlike with features and auxiliaries, with deprels it is less clear
###!!! whether we actually want to switch the set of labels when the token
###!!! belongs to another language. If the set is changed at all, then it
###!!! should be a union of the main language and the token language.
###!!! Otherwise we risk that, e.g., we have allowed 'flat:name' for our
###!!! language, the maintainers of the other language have not allowed it,
###!!! and then we could not use it when the foreign language is active.
###!!! (This has actually happened in French GSD.)
altlang = None
#altlang = get_alt_language(cols[MISC])
#if altlang:
# deprelset = get_depreldata_for_language(altlang)
# Test only the universal part if testing at universal level.
deprel = cols[DEPREL]
testlevel = 4
if args.level < 4:
deprel = lspec2ud(deprel)
testlevel = 2
if deprelset is not None and deprel not in deprelset:
testclass = 'Syntax'
testid = 'unknown-deprel'
# If some relations were excluded because they are not documented,
# tell the user when the first unknown relation is encountered in the data.
# Then erase this (long) introductory message and do not repeat it with
# other instances of unknown relations.
testmessage = f"Unknown DEPREL label: '{cols[DEPREL]}'"
if not altlang and len(warn_on_undoc_deps) > 0:
testmessage += "\n\n" + warn_on_undoc_deps
warn_on_undoc_deps = ''
warn(testmessage, testclass, testlevel, testid)
if DEPS >= len(cols):
return # this has been already reported in trees()
if tag_sets[DEPS] is not None and cols[DEPS] != '_':
for head_deprel in cols[DEPS].split('|'):
try:
head,deprel=head_deprel.split(':', 1)
except ValueError:
testclass = 'Enhanced'
testid = 'invalid-head-deprel' # but it would have probably triggered another error above
testmessage = f"Malformed head:deprel pair '{head_deprel}'."
warn(testmessage, testclass, testlevel, testid)
continue
if args.level < 4:
deprel = lspec2ud(deprel)
if deprel not in tag_sets[DEPS]:
testclass = 'Enhanced'
testid = 'unknown-edeprel'
testmessage = f"Unknown enhanced relation type '{deprel}' in '{head_deprel}'"
if not altlang and len(warn_on_undoc_edeps) > 0:
testmessage += "\n\n" + warn_on_undoc_edeps
warn_on_undoc_edeps = ''
warn(testmessage, testclass, testlevel, testid)
##### Tests applicable to the whole sentence
def subset_to_words_and_empty_nodes(tree):
"""
Only picks word and empty node lines, skips multiword token lines.
"""
return [cols for cols in tree if is_word(cols) or is_empty_node(cols)]
def deps_list(cols):
if DEPS >= len(cols):
return # this has been already reported in trees()
if cols[DEPS] == '_':