Skip to content

Commit

Permalink
Fix tokenising when using more than just a-zA-Z
Browse files Browse the repository at this point in the history
Previously: `Händler` would be tokenized as `ndler` or `ändler` depending on the Python version,
rather than the expected `händler`.

Solution: use `regex` rather than `re`.
This gives us the ability to use unicode character classes such as `[[:upper:]]` and `[[:lower:]]`

Fixes #35
  • Loading branch information
robotdana committed Nov 30, 2018
1 parent 70307ba commit 08b4eff
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
/MANIFEST
__pycache__/
*.pyc
test.cram.err
20 changes: 10 additions & 10 deletions scspell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

import argparse
import os
import re
import regex
import sys
import shutil
import uuid
Expand Down Expand Up @@ -78,22 +78,22 @@
# Treat anything alphanumeric as a token of interest, as long as it is not
# immediately preceded by a single backslash. (The string "\ntext" should
# match on "text" rather than "ntext".)
C_ESCAPE_TOKEN_REGEX = re.compile(r'(?<![^\\]\\)\w+')
C_ESCAPE_TOKEN_REGEX = regex.compile(r'(?<![^\\]\\)\w+')

# \ is not a character escape in e.g. LaTeX
TOKEN_REGEX = re.compile(r'\w+')
TOKEN_REGEX = regex.compile(r'\w+')

# Hex digits will be treated as a special case, because they can look like
# word-like even though they are actually numeric
HEX_REGEX = re.compile(r'0x[0-9a-fA-F]+')
HEX_REGEX = regex.compile(r'0x[0-9a-fA-F]+')

# We assume that tokens will be split using either underscores,
# digits, or camelCase conventions (or both)
US_REGEX = re.compile(r'[_\d]+')
CAMEL_WORD_REGEX = re.compile(r'([A-Z][a-z]*)')
US_REGEX = regex.compile(r'[_\d]+')
CAMEL_WORD_REGEX = regex.compile(r'([[:upper:]][[:lower:]]*)')

# File-id specifiers take this form
FILE_ID_REGEX = re.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')
FILE_ID_REGEX = regex.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')


class MatchDescriptor(object):
Expand Down Expand Up @@ -384,7 +384,7 @@ def handle_failed_check_interactively(
print("%s:%u: Unmatched '%s' --> {%s}" %
(filename, match_desc.get_line_num(), token,
', '.join([st for st in unmatched_subtokens])))
MATCH_REGEX = re.compile(re.escape(match_desc.get_token()))
MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token()))
while True:
print("""\
(i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or
Expand All @@ -405,7 +405,7 @@ def handle_failed_check_interactively(
(Canceled.)\n""")
else:
ignores.add(replacement.lower())
tail = re.sub(
tail = regex.sub(
MATCH_REGEX, replacement, match_desc.get_remainder(),
1 if ch == 'r' else 0)
print()
Expand Down Expand Up @@ -771,7 +771,7 @@ def add_to_dict(dictionary_type, word, files=[],
dicts.add_by_file_id(word, file_id)

elif dictionary_type[0] == 'p':
ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
if not dicts.add_by_extension(word, ext):
print("Dictionary for file extension '{}' not found."
.format(ext), file=sys.stderr)
Expand Down
4 changes: 2 additions & 2 deletions scspell/_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import io
import json
import os
import re
import regex
import sys
from bisect import bisect_left
from . import _util
Expand All @@ -41,7 +41,7 @@


# Valid file ID strings take this form
FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+')
FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+')


MATCH_NATURAL = 0x1
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ def get_version():
'Topic :: Software Development',
'Topic :: Text Processing :: Linguistic',
'Topic :: Utilities'],
platforms=['any']
platforms=['any'],
install_requires=['regex']
)
9 changes: 8 additions & 1 deletion test.cram
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ Test okay file.
$ echo 'This is okay.' > good.txt
$ $SCSPELL good.txt


Test file with --override-dictionary and a fileid mapping entry

$ cp -a "$TESTDIR/tests" .
Expand All @@ -27,6 +26,14 @@ Test file with --override-dictionary and a fileid mapping entry
tests/fileidmap/inputfile2.txt:4: 'soem' not found in dictionary (from token 'soem')
[1]

Test spelling mistake with diacritics.

$ $SCSPELL 'tests/basedicts/unicode-testfile'
tests/basedicts/unicode-testfile:1: 'b\xe4dly' not found in dictionary (from token 'B\xe4dly')
tests/basedicts/unicode-testfile:1: '\xe1lmost' not found in dictionary (from token '\xc1lmost')
tests/basedicts/unicode-testfile:1: '\xe7\xe5m\xe9l', '\xe7\xe4se' were not found in the dictionary (from token '\xc7\xe5m\xe9l\xc7\xe4se')
[1]

Test file ID manipulations

$ $SCSPELL --override-dictionary tests/fileidmap/dictionary \
Expand Down
1 change: 1 addition & 0 deletions tests/basedicts/unicode-testfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bädly Álmost ÇåmélÇäse

0 comments on commit 08b4eff

Please sign in to comment.