Skip to content

Commit

Permalink
add CYRILLIC_PUNCT to wb analysis
Browse files: browse the repository at this point in the history
  • Loading branch information
Ulf Hermjakob authored and Ulf Hermjakob committed Dec 6, 2020
1 parent 9e23b65 commit d8b361a
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
9 changes: 6 additions & 3 deletions wildebeest_analysis.pl
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ sub init_ht {
CYRILLIC: Token contains Cyrillic character
CYRILLIC_EXTENDED: Token contains Cyrillic extended character
MIXED_CYRILLIC_LATIN: Token contains mix of Cyrillic and Latin
MIXED_PUNCT_CYRILLIC: Token contains mix of Punctuation followed by Cyrillic
PUNCT_CYRILLIC: Token contains punctuation followed by Cyrillic
CYRILLIC_PUNCT: Token contains Cyrillic followed by punctuation
MIXED_CYRILLIC_PUNCT: Token contains mix of Cyrillic and Punctuation
CYRILLIC_PLUS_PERIOD: Token contains Cyrillic and a period (possibly abbreviation)
DEVANAGARI: Token contains Devanagari character (Indian languages)
Expand Down Expand Up @@ -620,9 +621,11 @@ sub special_token_type {
|| ($token =~ /(?:\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])/)) {
if ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])/) { # ... Cyrillic
if ($token =~ /^(?:(?:[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]|\xE2[\x80-\xAF]|\xC2[\xA0-\xBF]|\xC3[\x97\xB7]|\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]|\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])[\x80-\xBF]*)+(?:[\xD0-\xD3]|\xD4[\x80-\xAF])/) {
&note_issue("MIXED_PUNCT_CYRILLIC", $token, $line_id);
} elsif ($token =~ /^(?:[\xD0-\xD3]|\xD4[\x80-\xAF])+\.$/) {
&note_issue("PUNCT_CYRILLIC", $token, $line_id);
} elsif ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])(?:[\x80-\xBF]*)\.$/) {
&note_issue("CYRILLIC_PLUS_PERIOD", $token, $line_id);
} elsif ($token =~ /(?:[\xD0-\xD3]|\xD4[\x80-\xAF])(?:[\x80-\xBF]*)(?:(?:[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]|\xE2[\x80-\xAF]|\xC2[\xA0-\xBF]|\xC3[\x97\xB7]|\xE3\x80[\x80-\x91\x94-\x9F\xB0\xBB-\xBD]|\xEF\xB8[\x90-\x99\xB0-\xBF]|\xEF\xB9[\x80-\xAB]|\xEF\xBD[\x9B-\xA4]|\xF0\x9F[\xA0-\xA3])[\x80-\xBF]*)+$/) {
&note_issue("CYRILLIC_PUNCT", $token, $line_id);
} else {
&note_issue("MIXED_CYRILLIC_PUNCT", $token, $line_id);
}
Expand Down
4 changes: 2 additions & 2 deletions wildebeest_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@

log.basicConfig(level=log.INFO)

__version__ = '0.6.1'
last_mod_date = 'November 30, 2020'
__version__ = '0.6.2'
last_mod_date = 'December 5, 2020'


class Wildebeest:
Expand Down

0 comments on commit d8b361a

Please sign in to comment.