diff --git a/.gitignore b/.gitignore index af4be75..a699f2b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ /MANIFEST __pycache__/ *.pyc +test.cram.err diff --git a/scspell/__init__.py b/scspell/__init__.py index 4300f4e..2c9c951 100644 --- a/scspell/__init__.py +++ b/scspell/__init__.py @@ -25,10 +25,11 @@ import argparse import os -import re +import regex import sys import shutil import uuid +import unicodedata try: import ConfigParser @@ -78,22 +79,22 @@ # Treat anything alphanumeric as a token of interest, as long as it is not # immediately preceded by a single backslash. (The string "\ntext" should # match on "text" rather than "ntext".) -C_ESCAPE_TOKEN_REGEX = re.compile(r'(? {%s}" % (filename, match_desc.get_line_num(), token, ', '.join([st for st in unmatched_subtokens]))) - MATCH_REGEX = re.compile(re.escape(match_desc.get_token())) + MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token())) while True: print("""\ (i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or @@ -405,7 +406,7 @@ def handle_failed_check_interactively( (Canceled.)\n""") else: ignores.add(replacement.lower()) - tail = re.sub( + tail = regex.sub( MATCH_REGEX, replacement, match_desc.get_remainder(), 1 if ch == 'r' else 0) print() @@ -771,7 +772,7 @@ def add_to_dict(dictionary_type, word, files=[], dicts.add_by_file_id(word, file_id) elif dictionary_type[0] == 'p': - ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower())) + ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower())) if not dicts.add_by_extension(word, ext): print("Dictionary for file extension '{}' not found." .format(ext), file=sys.stderr) diff --git a/scspell/_corpus.py b/scspell/_corpus.py index 6a3e4d8..1c9946c 100644 --- a/scspell/_corpus.py +++ b/scspell/_corpus.py @@ -29,7 +29,7 @@ import io import json import os -import re +import regex import sys from bisect import bisect_left from . import _util @@ -41,7 +41,7 @@ # Valid file ID strings take this form -FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+') +FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+') MATCH_NATURAL = 0x1 diff --git a/setup.py b/setup.py index e8b77fa..7a8c474 100755 --- a/setup.py +++ b/setup.py @@ -43,5 +43,6 @@ def get_version(): 'Topic :: Software Development', 'Topic :: Text Processing :: Linguistic', 'Topic :: Utilities'], - platforms=['any'] + platforms=['any'], + install_requires=['regex'] ) diff --git a/test.cram b/test.cram index 854fe3a..90ff02a 100755 --- a/test.cram +++ b/test.cram @@ -16,7 +16,6 @@ Test okay file. $ echo 'This is okay.' > good.txt $ $SCSPELL good.txt - Test file with --override-dictionary and a fileid mapping entry $ cp -a "$TESTDIR/tests" . @@ -27,6 +26,14 @@ Test file with --override-dictionary and a fileid mapping entry tests/fileidmap/inputfile2.txt:4: 'soem' not found in dictionary (from token 'soem') [1] +Test spelling mistake with diacritics. + + $ $SCSPELL 'tests/basedicts/unicode-testfile' + tests/basedicts/unicode-testfile:1: 'b\xc3\xa4dly' not found in dictionary (from token 'B\xc3\xa4dly') (esc) + tests/basedicts/unicode-testfile:1: '\xc3\xa1lmost' not found in dictionary (from token '\xc3\x81lmost') (esc) + tests/basedicts/unicode-testfile:1: '\xc3\xa7\xc3\xa5m\xc3\xa9l', '\xc3\xa7\xc3\xa4se' were not found in the dictionary (from token '\xc3\x87\xc3\xa5m\xc3\xa9l\xc3\x87\xc3\xa4se') (esc) + [1] + Test file ID manipulations $ $SCSPELL --override-dictionary tests/fileidmap/dictionary \ diff --git a/tests/basedicts/unicode-testfile b/tests/basedicts/unicode-testfile new file mode 100644 index 0000000..30e7b38 --- /dev/null +++ b/tests/basedicts/unicode-testfile @@ -0,0 +1 @@ +Bädly Álmost ÇåmélÇäse