From e702fa39a7f3a83818892ae26229aedd702217a7 Mon Sep 17 00:00:00 2001 From: g3n35i5 Date: Thu, 18 Jul 2019 09:54:40 +0200 Subject: [PATCH] Check for numbers in sentences can be switched off If the CorporaCreator is used with data in which it is valid that sentences contain numbers, there should be a way to allow them. With the optional command line parameter "-c" this check can now be skipped. Usage: create-corpora [other args] -c {true, false, t, f, 0, 1, y, n, yes, no} --- .gitignore | 1 + src/corporacreator/argparse.py | 20 ++++++++++++++++++++ src/corporacreator/corpora.py | 8 ++++---- src/corporacreator/preprocessors/common.py | 4 ++-- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 22c6c74..fadb48f 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,4 @@ MANIFEST # Per-project virtualenvs .virtualenv/ +venv/ diff --git a/src/corporacreator/argparse.py b/src/corporacreator/argparse.py index 1fbbc47..84afeaf 100644 --- a/src/corporacreator/argparse.py +++ b/src/corporacreator/argparse.py @@ -24,6 +24,17 @@ def _check_positive(value): return ivalue +def _check_boolean(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + def parse_args(args): """Parse command line parameters @@ -89,4 +100,13 @@ def parse_args(args): help="Maximum number of times a sentence can appear in a corpus.", dest="duplicate_sentence_count", ) + parser.add_argument( + "-c", + "--check-for-digits", + default=True, + required=False, + type=_check_boolean, + help="Check sentences for digits", + dest="check_for_digits", + ) return parser.parse_args(args) diff --git a/src/corporacreator/corpora.py b/src/corporacreator/corpora.py index 6b89946..fef0da1 100644 --- a/src/corporacreator/corpora.py +++ b/src/corporacreator/corpora.py @@ -12,9 +12,9 @@ _logger = 
logging.getLogger(__name__) -def common_wrapper(sentence, up_votes, down_votes): - is_valid, sentence = common(sentence) - if False == is_valid: +def common_wrapper(sentence, up_votes, down_votes, check_for_digits): + is_valid, sentence = common(sentence, check_for_digits) + if not is_valid: up_votes = 0 down_votes = 2 return pd.Series([sentence, up_votes, down_votes]) @@ -42,7 +42,7 @@ def create(self): corpora_data = self._parse_tsv() corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[ ["sentence", "up_votes", "down_votes"] - ].swifter.apply(func=lambda arg: common_wrapper(*arg), axis=1) + ].swifter.apply(func=lambda arg: common_wrapper(*arg, self.args.check_for_digits), axis=1) if self.args.langs: # check if all languages provided at command line are actually # in the clips.tsv file, if not, throw error diff --git a/src/corporacreator/preprocessors/common.py b/src/corporacreator/preprocessors/common.py index cf0dff0..fa2644a 100644 --- a/src/corporacreator/preprocessors/common.py +++ b/src/corporacreator/preprocessors/common.py @@ -66,7 +66,7 @@ def _strip_string(sentence): return u''.join([c for c in sentence if unicodedata.category(c) in allowed_categories]) -def common(sentence): +def common(sentence, check_for_digits): """Cleans up the passed sentence in a language independent manner, removing or reformatting invalid data. Args: @@ -88,7 +88,7 @@ def common(sentence): sentence = (' ').join(sentence.split()) # TODO: Clean up data in a language independent manner # If the sentence contains digits reject it - if _has_digit(sentence): + if check_for_digits and _has_digit(sentence): is_valid = False # If the sentence is blank reject it if not sentence.strip():