From 08b4eff19f6b3bbce54f39b541f13596597a6235 Mon Sep 17 00:00:00 2001
From: Dana Sherson <robot@dana.sh>
Date: Fri, 30 Nov 2018 10:49:59 +1100
Subject: [PATCH] Fix tokenising when using more than just a-zA-Z
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, `Händler` would be tokenized as `ndler` or `ändler` (depending on
the Python version) rather than the expected `händler`.

Solution: use the third-party `regex` module rather than the standard `re` module.
This gives us the ability to use Unicode-aware character classes such as `[[:upper:]]` and `[[:lower:]]`.

Fixes #35
---
 .gitignore                       |  1 +
 scspell/__init__.py              | 20 ++++++++++----------
 scspell/_corpus.py               |  4 ++--
 setup.py                         |  3 ++-
 test.cram                        |  9 ++++++++-
 tests/basedicts/unicode-testfile |  1 +
 6 files changed, 24 insertions(+), 14 deletions(-)
 create mode 100644 tests/basedicts/unicode-testfile

diff --git a/.gitignore b/.gitignore
index af4be75..a699f2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 /MANIFEST
 __pycache__/
 *.pyc
+test.cram.err
diff --git a/scspell/__init__.py b/scspell/__init__.py
index 4300f4e..6b6ab12 100644
--- a/scspell/__init__.py
+++ b/scspell/__init__.py
@@ -25,7 +25,7 @@
 
 import argparse
 import os
-import re
+import regex
 import sys
 import shutil
 import uuid
@@ -78,22 +78,22 @@
 # Treat anything alphanumeric as a token of interest, as long as it is not
 # immediately preceded by a single backslash.  (The string "\ntext" should
 # match on "text" rather than "ntext".)
-C_ESCAPE_TOKEN_REGEX = re.compile(r'(?<![^\\]\\)\w+')
+C_ESCAPE_TOKEN_REGEX = regex.compile(r'(?<![^\\]\\)\w+')
 
 # \ is not a character escape in e.g. LaTeX
-TOKEN_REGEX = re.compile(r'\w+')
+TOKEN_REGEX = regex.compile(r'\w+')
 
 # Hex digits will be treated as a special case, because they can look like
 # word-like even though they are actually numeric
-HEX_REGEX = re.compile(r'0x[0-9a-fA-F]+')
+HEX_REGEX = regex.compile(r'0x[0-9a-fA-F]+')
 
 # We assume that tokens will be split using either underscores,
 # digits, or camelCase conventions (or both)
-US_REGEX = re.compile(r'[_\d]+')
-CAMEL_WORD_REGEX = re.compile(r'([A-Z][a-z]*)')
+US_REGEX = regex.compile(r'[_\d]+')
+CAMEL_WORD_REGEX = regex.compile(r'([[:upper:]][[:lower:]]*)')
 
 # File-id specifiers take this form
-FILE_ID_REGEX = re.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')
+FILE_ID_REGEX = regex.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')
 
 
 class MatchDescriptor(object):
@@ -384,7 +384,7 @@ def handle_failed_check_interactively(
     print("%s:%u: Unmatched '%s' --> {%s}" %
           (filename, match_desc.get_line_num(), token,
            ', '.join([st for st in unmatched_subtokens])))
-    MATCH_REGEX = re.compile(re.escape(match_desc.get_token()))
+    MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token()))
     while True:
         print("""\
    (i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or
@@ -405,7 +405,7 @@ def handle_failed_check_interactively(
       (Canceled.)\n""")
             else:
                 ignores.add(replacement.lower())
-                tail = re.sub(
+                tail = regex.sub(
                     MATCH_REGEX, replacement, match_desc.get_remainder(),
                     1 if ch == 'r' else 0)
                 print()
@@ -771,7 +771,7 @@ def add_to_dict(dictionary_type, word, files=[],
             dicts.add_by_file_id(word, file_id)
 
         elif dictionary_type[0] == 'p':
-            ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
+            ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
             if not dicts.add_by_extension(word, ext):
                 print("Dictionary for file extension '{}' not found."
                       .format(ext), file=sys.stderr)
diff --git a/scspell/_corpus.py b/scspell/_corpus.py
index 6a3e4d8..1c9946c 100644
--- a/scspell/_corpus.py
+++ b/scspell/_corpus.py
@@ -29,7 +29,7 @@
 import io
 import json
 import os
-import re
+import regex
 import sys
 from bisect import bisect_left
 from . import _util
@@ -41,7 +41,7 @@
 
 
 # Valid file ID strings take this form
-FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+')
+FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+')
 
 
 MATCH_NATURAL = 0x1
diff --git a/setup.py b/setup.py
index e8b77fa..7a8c474 100755
--- a/setup.py
+++ b/setup.py
@@ -43,5 +43,6 @@ def get_version():
         'Topic :: Software Development',
         'Topic :: Text Processing :: Linguistic',
         'Topic :: Utilities'],
-    platforms=['any']
+    platforms=['any'],
+    install_requires=['regex']
 )
diff --git a/test.cram b/test.cram
index 854fe3a..cdb0355 100755
--- a/test.cram
+++ b/test.cram
@@ -16,7 +16,6 @@ Test okay file.
     $ echo 'This is okay.' > good.txt
     $ $SCSPELL good.txt
 
-
 Test file with --override-dictionary and a fileid mapping entry
 
     $ cp -a "$TESTDIR/tests" .
@@ -27,6 +26,14 @@ Test file with --override-dictionary and a fileid mapping entry
     tests/fileidmap/inputfile2.txt:4: 'soem' not found in dictionary (from token 'soem')
     [1]
 
+Test spelling mistake with diacritics.
+
+    $ $SCSPELL 'tests/basedicts/unicode-testfile'
+    tests/basedicts/unicode-testfile:1: 'b\xe4dly' not found in dictionary (from token 'B\xe4dly')
+    tests/basedicts/unicode-testfile:1: '\xe1lmost' not found in dictionary (from token '\xc1lmost')
+    tests/basedicts/unicode-testfile:1: '\xe7\xe5m\xe9l', '\xe7\xe4se' were not found in the dictionary (from token '\xc7\xe5m\xe9l\xc7\xe4se')
+    [1]
+
 Test file ID manipulations
 
     $ $SCSPELL --override-dictionary tests/fileidmap/dictionary \
diff --git a/tests/basedicts/unicode-testfile b/tests/basedicts/unicode-testfile
new file mode 100644
index 0000000..30e7b38
--- /dev/null
+++ b/tests/basedicts/unicode-testfile
@@ -0,0 +1 @@
+Bädly Álmost ÇåmélÇäse