From 08b4eff19f6b3bbce54f39b541f13596597a6235 Mon Sep 17 00:00:00 2001 From: Dana Sherson Date: Fri, 30 Nov 2018 10:49:59 +1100 Subject: [PATCH] Fix tokenising when using more than just a-zA-Z MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously: `Händler` would be tokenized as `ndler` or `ändler` depending on the Python version, rather than the expected `händler`. Solution: use `regex` rather than `re`. This gives us the ability to use Unicode character classes such as `[[:upper:]]` and `[[:lower:]]` Fixes #35 --- .gitignore | 1 + scspell/__init__.py | 20 ++++++++++---------- scspell/_corpus.py | 4 ++-- setup.py | 3 ++- test.cram | 9 ++++++++- tests/basedicts/unicode-testfile | 1 + 6 files changed, 24 insertions(+), 14 deletions(-) create mode 100644 tests/basedicts/unicode-testfile diff --git a/.gitignore b/.gitignore index af4be75..a699f2b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ /MANIFEST __pycache__/ *.pyc +test.cram.err diff --git a/scspell/__init__.py b/scspell/__init__.py index 4300f4e..6b6ab12 100644 --- a/scspell/__init__.py +++ b/scspell/__init__.py @@ -25,7 +25,7 @@ import argparse import os -import re +import regex import sys import shutil import uuid @@ -78,22 +78,22 @@ # Treat anything alphanumeric as a token of interest, as long as it is not # immediately preceded by a single backslash. (The string "\ntext" should # match on "text" rather than "ntext".) -C_ESCAPE_TOKEN_REGEX = re.compile(r'(? 
{%s}" % (filename, match_desc.get_line_num(), token, ', '.join([st for st in unmatched_subtokens]))) - MATCH_REGEX = re.compile(re.escape(match_desc.get_token())) + MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token())) while True: print("""\ (i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or @@ -405,7 +405,7 @@ def handle_failed_check_interactively( (Canceled.)\n""") else: ignores.add(replacement.lower()) - tail = re.sub( + tail = regex.sub( MATCH_REGEX, replacement, match_desc.get_remainder(), 1 if ch == 'r' else 0) print() @@ -771,7 +771,7 @@ def add_to_dict(dictionary_type, word, files=[], dicts.add_by_file_id(word, file_id) elif dictionary_type[0] == 'p': - ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower())) + ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower())) if not dicts.add_by_extension(word, ext): print("Dictionary for file extension '{}' not found." .format(ext), file=sys.stderr) diff --git a/scspell/_corpus.py b/scspell/_corpus.py index 6a3e4d8..1c9946c 100644 --- a/scspell/_corpus.py +++ b/scspell/_corpus.py @@ -29,7 +29,7 @@ import io import json import os -import re +import regex import sys from bisect import bisect_left from . import _util @@ -41,7 +41,7 @@ # Valid file ID strings take this form -FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+') +FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+') MATCH_NATURAL = 0x1 diff --git a/setup.py b/setup.py index e8b77fa..7a8c474 100755 --- a/setup.py +++ b/setup.py @@ -43,5 +43,6 @@ def get_version(): 'Topic :: Software Development', 'Topic :: Text Processing :: Linguistic', 'Topic :: Utilities'], - platforms=['any'] + platforms=['any'], + install_requires=['regex'] ) diff --git a/test.cram b/test.cram index 854fe3a..cdb0355 100755 --- a/test.cram +++ b/test.cram @@ -16,7 +16,6 @@ Test okay file. $ echo 'This is okay.' > good.txt $ $SCSPELL good.txt - Test file with --override-dictionary and a fileid mapping entry $ cp -a "$TESTDIR/tests" . 
@@ -27,6 +26,14 @@ Test file with --override-dictionary and a fileid mapping entry tests/fileidmap/inputfile2.txt:4: 'soem' not found in dictionary (from token 'soem') [1] +Test spelling mistake with diacritics. + + $ $SCSPELL 'tests/basedicts/unicode-testfile' + tests/basedicts/unicode-testfile:1: 'b\xe4dly' not found in dictionary (from token 'B\xe4dly') + tests/basedicts/unicode-testfile:1: '\xe1lmost' not found in dictionary (from token '\xc1lmost') + tests/basedicts/unicode-testfile:1: '\xe7\xe5m\xe9l', '\xe7\xe4se' were not found in the dictionary (from token '\xc7\xe5m\xe9l\xc7\xe4se') + [1] + Test file ID manipulations $ $SCSPELL --override-dictionary tests/fileidmap/dictionary \ diff --git a/tests/basedicts/unicode-testfile b/tests/basedicts/unicode-testfile new file mode 100644 index 0000000..30e7b38 --- /dev/null +++ b/tests/basedicts/unicode-testfile @@ -0,0 +1 @@ +Bädly Álmost ÇåmélÇäse