From dab0369970ceff67e8edf1bedd866039380a1763 Mon Sep 17 00:00:00 2001 From: sciatro Date: Mon, 28 Jun 2021 21:42:38 -0400 Subject: [PATCH] Document that preprocessing.strip_punctuation is limited to ASCII (#2964) * Clarifying strip_punctuation limited to ASCII Add ASCII as a qualification to the `strip_punctuation` docstring. This is the "option 1" fix for issue #2962. * Added code comment pointing to issue 2962 Code comment added linking to issue #2962 as a reminder of enhancement possibilities. * update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 2 +- gensim/parsing/preprocessing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e8b272287..5d2ae8845c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ Changes * [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil) * [#3148](https://github.com/RaRe-Technologies/gensim/pull/3148): Fix broken link in documentation, by [@rohit901](https://github.com/rohit901) * [#3155](https://github.com/RaRe-Technologies/gensim/pull/3155): Correct parameter name in documentation of fasttext.py, by [@bizzyvinci](https://github.com/bizzyvinci) - +* [#2964](https://github.com/RaRe-Technologies/gensim/pull/2964): Document that preprocessing.strip_punctuation is limited to ASCII, by [@sciatro](https://github.com/sciatro) ## 4.0.1, 2021-04-01 Bugfix release to address issues with Wheels on Windows: diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 777ca46e8e..5fd45d2421 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -94,7 +94,7 @@ def remove_stopwords(s): def strip_punctuation(s): - """Replace punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`. + """Replace ASCII punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`. 
Parameters ---------- @@ -115,6 +115,7 @@ def strip_punctuation(s): """ s = utils.to_unicode(s) + # For unicode enhancement options see https://github.com/RaRe-Technologies/gensim/issues/2962 return RE_PUNCT.sub(" ", s)