From d8b361a8b1a7c6122b6681f4ec0f8b5e4379ccce Mon Sep 17 00:00:00 2001 From: Ulf Hermjakob Date: Sat, 5 Dec 2020 22:21:38 -0800 Subject: [PATCH] add CYRILLIC_PUNCT to wb analysis --- wildebeest_analysis.pl | 9 ++++++--- wildebeest_normalize.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/wildebeest_analysis.pl b/wildebeest_analysis.pl index fd5e2c8..8b7a64c 100755 --- a/wildebeest_analysis.pl +++ b/wildebeest_analysis.pl @@ -160,7 +160,8 @@ sub init_ht { CYRILLIC: Token contains Cyrillic character CYRILLIC_EXTENDED: Token contains Cyrillic extended character MIXED_CYRILLIC_LATIN: Token contains mix of Cyrillic and Latin - MIXED_PUNCT_CYRILLIC: Token contains mix of Punctuation followed by Cyrillic + PUNCT_CYRILLIC: Token contains punctuation followed by Cyrillic + CYRILLIC_PUNCT: Token contains Cyrillic followed by punctuation MIXED_CYRILLIC_PUNCT: Token contains mix of Cyrillic and Punctuation CYRILLIC_PLUS_PERIOD: Token contains Cyrillic and a period (possibly abbreviation) DEVANAGARI: Token contains Devanagari character (Indian languages) @@ -620,9 +621,11 @@ sub special_token_type { || ($token =~ /(?:\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])/)) { if ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])/) { # ... 
Cyrillic if ($token =~ /^(?:(?:[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]|\xE2[\x80-\xAF]|\xC2[\xA0-\xBF]|\xC3[\x97\xB7]|\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]|\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])[\x80-\xBF]*)+(?:[\xD0-\xD3]|\xD4[\x80-\xAF])/) { - &note_issue("MIXED_PUNCT_CYRILLIC", $token, $line_id); - } elsif ($token =~ /^(?:[\xD0-\xD3]|\xD4[\x80-\xAF])+\.$/) { + &note_issue("PUNCT_CYRILLIC", $token, $line_id); + } elsif ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])(?:[\x80-\xBF]*)\.$/) { &note_issue("CYRILLIC_PLUS_PERIOD", $token, $line_id); + } elsif ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])(?:[\x80-\xBF]*)(?:(?:[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]|\xE2[\x80-\xAF]|\xC2[\xA0-\xBF]|\xC3[\x97\xB7]|\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]|\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])[\x80-\xBF]*)+$/) { + &note_issue("CYRILLIC_PUNCT", $token, $line_id); } else { &note_issue("MIXED_CYRILLIC_PUNCT", $token, $line_id); } diff --git a/wildebeest_normalize.py b/wildebeest_normalize.py index f48ac8b..060cbc5 100644 --- a/wildebeest_normalize.py +++ b/wildebeest_normalize.py @@ -65,8 +65,8 @@ log.basicConfig(level=log.INFO) -__version__ = '0.6.1' -last_mod_date = 'November 30, 2020' +__version__ = '0.6.2' +last_mod_date = 'December 5, 2020' class Wildebeest: