From e702fa39a7f3a83818892ae26229aedd702217a7 Mon Sep 17 00:00:00 2001
From: g3n35i5 <janfschmidt@me.com>
Date: Thu, 18 Jul 2019 09:54:40 +0200
Subject: [PATCH] Check for numbers in sentences can be switched off

If the CorporaCreator is used with data in which it is valid that
sentences contain numbers, there should be a way to allow them.

The new optional command line parameter "-c" ("--check-for-digits",
default: true) controls this check; pass a false value to skip it.

Usage:

create-corpora [other args] -c {true, false, t, f, 0, 1, y, n, yes, no}
---
 .gitignore                                 |  1 +
 src/corporacreator/argparse.py             | 20 ++++++++++++++++++++
 src/corporacreator/corpora.py              |  8 ++++----
 src/corporacreator/preprocessors/common.py |  4 ++--
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index 22c6c74..fadb48f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,4 @@ MANIFEST
 
 # Per-project virtualenvs
 .virtualenv/
+venv/
diff --git a/src/corporacreator/argparse.py b/src/corporacreator/argparse.py
index 1fbbc47..84afeaf 100644
--- a/src/corporacreator/argparse.py
+++ b/src/corporacreator/argparse.py
@@ -24,6 +24,17 @@ def _check_positive(value):
     return ivalue
 
 
+def _check_boolean(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
 def parse_args(args):
     """Parse command line parameters
 
@@ -89,4 +100,13 @@ def parse_args(args):
         help="Maximum number of times a sentence can appear in a corpus.",
         dest="duplicate_sentence_count",
     )
+    parser.add_argument(
+        "-c",
+        "--check-for-digits",
+        default=True,
+        required=False,
+        type=_check_boolean,
+        help="Check sentences for digits",
+        dest="check_for_digits",
+    )
     return parser.parse_args(args)
diff --git a/src/corporacreator/corpora.py b/src/corporacreator/corpora.py
index 6b89946..fef0da1 100644
--- a/src/corporacreator/corpora.py
+++ b/src/corporacreator/corpora.py
@@ -12,9 +12,9 @@
 _logger = logging.getLogger(__name__)
 
 
-def common_wrapper(sentence, up_votes, down_votes):
-    is_valid, sentence = common(sentence)
-    if False == is_valid:
+def common_wrapper(sentence, up_votes, down_votes, check_for_digits):
+    is_valid, sentence = common(sentence, check_for_digits)
+    if not is_valid:
         up_votes = 0
         down_votes = 2
     return pd.Series([sentence, up_votes, down_votes])
@@ -42,7 +42,7 @@ def create(self):
         corpora_data = self._parse_tsv()
         corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[
             ["sentence", "up_votes", "down_votes"]
-        ].swifter.apply(func=lambda arg: common_wrapper(*arg), axis=1)
+        ].swifter.apply(func=lambda arg: common_wrapper(*arg, self.args.check_for_digits), axis=1)
         if self.args.langs:
             # check if all languages provided at command line are actually
             # in the clips.tsv file, if not, throw error
diff --git a/src/corporacreator/preprocessors/common.py b/src/corporacreator/preprocessors/common.py
index cf0dff0..fa2644a 100644
--- a/src/corporacreator/preprocessors/common.py
+++ b/src/corporacreator/preprocessors/common.py
@@ -66,7 +66,7 @@ def _strip_string(sentence):
     return u''.join([c for c in sentence if unicodedata.category(c) in allowed_categories])
 
 
-def common(sentence):
+def common(sentence, check_for_digits):
     """Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data.
 
     Args:
@@ -88,7 +88,7 @@ def common(sentence):
     sentence = (' ').join(sentence.split())
     # TODO: Clean up data in a language independent manner
     # If the sentence contains digits reject it
-    if _has_digit(sentence):
+    if check_for_digits and _has_digit(sentence):
         is_valid = False
     # If the sentence is blank reject it
     if not sentence.strip():