diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py index b12c2cc2..8ea203dd 100644 --- a/src/invoice2data/extract/invoice_template.py +++ b/src/invoice2data/extract/invoice_template.py @@ -14,17 +14,17 @@ logger = logging.getLogger(__name__) OPTIONS_DEFAULT = { - 'remove_whitespace': False, - 'remove_accents': False, - 'lowercase': False, - 'currency': 'EUR', - 'date_formats': [], - 'languages': [], - 'decimal_separator': '.', - 'replace': [], # example: see templates/fr/fr.free.mobile.yml + "remove_whitespace": False, + "remove_accents": False, + "lowercase": False, + "currency": "EUR", + "date_formats": [], + "languages": [], + "decimal_separator": ".", + "replace": [], # example: see templates/fr/fr.free.mobile.yml } -PLUGIN_MAPPING = {'lines': lines, 'tables': tables} +PLUGIN_MAPPING = {"lines": lines, "tables": tables} class InvoiceTemplate(OrderedDict): @@ -53,15 +53,15 @@ def __init__(self, *args, **kwargs): # Merge template-specific options with defaults self.options = OPTIONS_DEFAULT.copy() - for lang in self.options['languages']: - assert len(lang) == 2, 'lang code must have 2 letters' + for lang in self.options["languages"]: + assert len(lang) == 2, "lang code must have 2 letters" - if 'options' in self: - self.options.update(self['options']) + if "options" in self: + self.options.update(self["options"]) # Set issuer, if it doesn't exist. - if 'issuer' not in self.keys(): - self['issuer'] = self['keywords'][0] + if "issuer" not in self.keys(): + self["issuer"] = self["keywords"][0] def prepare_input(self, extracted_str): """ @@ -69,22 +69,22 @@ def prepare_input(self, extracted_str): """ # Remove withspace - if self.options['remove_whitespace']: - optimized_str = re.sub(' +', '', extracted_str) + if self.options["remove_whitespace"]: + optimized_str = re.sub(" +", "", extracted_str) else: optimized_str = extracted_str # Remove accents - if self.options['remove_accents']: + if self.options["remove_accents"]: optimized_str = unidecode(optimized_str) # convert to lower case - if self.options['lowercase']: + if self.options["lowercase"]: optimized_str = optimized_str.lower() # specific replace - for replace in self.options['replace']: - assert len(replace) == 2, 'A replace should be a list of 2 items' + for replace in self.options["replace"]: + assert len(replace) == 2, "A replace should be a list of 2 items" optimized_str = optimized_str.replace(replace[0], replace[1]) return optimized_str @@ -92,72 +92,76 @@ def prepare_input(self, extracted_str): def matches_input(self, optimized_str): """See if string matches keywords set in template file""" - if all([keyword in optimized_str for keyword in self['keywords']]): - logger.debug('Matched template %s', self['template_name']) + if all([keyword in optimized_str for keyword in self["keywords"]]): + logger.debug("Matched template %s", self["template_name"]) return True def parse_number(self, value): assert ( - value.count(self.options['decimal_separator']) < 2 - ), 'Decimal separator cannot be present several times' + value.count(self.options["decimal_separator"]) < 2 + ), "Decimal separator cannot be present several times" # replace decimal separator by a | - amount_pipe = value.replace(self.options['decimal_separator'], '|') + amount_pipe = value.replace(self.options["decimal_separator"], "|") # remove all possible thousands separators - amount_pipe_no_thousand_sep = re.sub(r'[.,\s]', '', amount_pipe) + amount_pipe_no_thousand_sep = re.sub(r"[.,\s]", "", amount_pipe) # put dot as decimal sep - return float(amount_pipe_no_thousand_sep.replace('|', '.')) + return float(amount_pipe_no_thousand_sep.replace("|", ".")) def parse_date(self, value): """Parses date and returns date after parsing""" res = dateparser.parse( - value, date_formats=self.options['date_formats'], languages=self.options['languages'] + value, + date_formats=self.options["date_formats"], + languages=self.options["languages"], ) logger.debug("result of date parsing=%s", res) return res def coerce_type(self, value, target_type): - if target_type == 'int': + if target_type == "int": if not value.strip(): return 0 return int(self.parse_number(value)) - elif target_type == 'float': + elif target_type == "float": if not value.strip(): return 0.0 return float(self.parse_number(value)) - elif target_type == 'date': + elif target_type == "date": return self.parse_date(value) - assert False, 'Unknown type' + assert False, "Unknown type" def extract(self, optimized_str): """ Given a template file and a string, extract matching data fields. """ - logger.debug('START optimized_str ========================') + logger.debug("START optimized_str ========================") logger.debug(optimized_str) - logger.debug('END optimized_str ==========================') + logger.debug("END optimized_str ==========================") logger.debug( - 'Date parsing: languages=%s date_formats=%s', - self.options['languages'], - self.options['date_formats'], + "Date parsing: languages=%s date_formats=%s", + self.options["languages"], + self.options["date_formats"], ) - logger.debug('Float parsing: decimal separator=%s', self.options['decimal_separator']) - logger.debug("keywords=%s", self['keywords']) + logger.debug( + "Float parsing: decimal separator=%s", self.options["decimal_separator"] + ) + logger.debug("keywords=%s", self["keywords"]) logger.debug(self.options) # Try to find data for each field. output = {} - output['issuer'] = self['issuer'] + output["issuer"] = self["issuer"] - for k, v in self['fields'].items(): - if k.startswith('static_'): + for k, v in self["fields"].items(): + if k.startswith("static_"): logger.debug("field=%s | static value=%s", k, v) - output[k.replace('static_', '')] = v + output[k.replace("static_", "")] = v else: logger.debug("field=%s | regexp=%s", k, v) sum_field = False - if k.startswith('sum_amount') and type(v) is list: + if k.startswith("sum_amount") and type(v) is list: k = k[4:] # remove 'sum_' prefix sum_field = True # Fields can have multiple expressions @@ -174,12 +178,14 @@ def extract(self, optimized_str): res_find = re.findall(v, optimized_str) if res_find: logger.debug("res_find=%s", res_find) - if k.startswith('date') or k.endswith('date'): + if k.startswith("date") or k.endswith("date"): output[k] = self.parse_date(res_find[0]) if not output[k]: - logger.error("Date parsing failed on date '%s'", res_find[0]) + logger.error( + "Date parsing failed on date '%s'", res_find[0] + ) return None - elif k.startswith('amount'): + elif k.startswith("amount"): if sum_field: output[k] = 0 for amount_to_parse in res_find: @@ -195,7 +201,7 @@ def extract(self, optimized_str): else: logger.warning("regexp for field %s didn't match", k) - output['currency'] = self.options['currency'] + output["currency"] = self.options["currency"] # Run plugins: for plugin_keyword, plugin_func in PLUGIN_MAPPING.items(): @@ -203,22 +209,24 @@ def extract(self, optimized_str): plugin_func.extract(self, optimized_str, output) # If required fields were found, return output, else log error. - if 'required_fields' not in self.keys(): - required_fields = ['date', 'amount', 'invoice_number', 'issuer'] + if "required_fields" not in self.keys(): + required_fields = ["date", "amount", "invoice_number", "issuer"] else: required_fields = [] - for v in self['required_fields']: + for v in self["required_fields"]: required_fields.append(v) if set(required_fields).issubset(output.keys()): - output['desc'] = 'Invoice from %s' % (self['issuer']) + output["desc"] = "Invoice from %s" % (self["issuer"]) logger.debug(output) return output else: fields = list(set(output.keys())) logger.error( - 'Unable to match all required fields. ' - 'The required fields are: {0}. ' - 'Output contains the following fields: {1}.'.format(required_fields, fields) + "Unable to match all required fields. " + "The required fields are: {0}. " + "Output contains the following fields: {1}.".format( + required_fields, fields + ) ) return None diff --git a/src/invoice2data/extract/loader.py b/src/invoice2data/extract/loader.py index 4ae31f3c..48de256a 100644 --- a/src/invoice2data/extract/loader.py +++ b/src/invoice2data/extract/loader.py @@ -13,7 +13,7 @@ import codecs import chardet -logging.getLogger('chardet').setLevel(logging.WARNING) +logging.getLogger("chardet").setLevel(logging.WARNING) # borrowed from http://stackoverflow.com/a/21912744 @@ -31,7 +31,9 @@ def construct_mapping(loader, node): loader.flatten_mapping(node) return object_pairs_hook(loader.construct_pairs(node)) - OrderedLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) + OrderedLoader.add_constructor( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping + ) return yaml.load(stream, OrderedLoader) @@ -77,23 +79,25 @@ def read_templates(folder=None): output = [] if folder is None: - folder = pkg_resources.resource_filename(__name__, 'templates') + folder = pkg_resources.resource_filename(__name__, "templates") for path, subdirs, files in os.walk(folder): for name in sorted(files): - if name.endswith('.yml'): - with open(os.path.join(path, name), 'rb') as f: - encoding = chardet.detect(f.read())['encoding'] - with codecs.open(os.path.join(path, name), encoding=encoding) as template_file: + if name.endswith(".yml"): + with open(os.path.join(path, name), "rb") as f: + encoding = chardet.detect(f.read())["encoding"] + with codecs.open( + os.path.join(path, name), encoding=encoding + ) as template_file: tpl = ordered_load(template_file.read()) - tpl['template_name'] = name + tpl["template_name"] = name # Test if all required fields are in template: - assert 'keywords' in tpl.keys(), 'Missing keywords field.' + assert "keywords" in tpl.keys(), "Missing keywords field." # Keywords as list, if only one. - if type(tpl['keywords']) is not list: - tpl['keywords'] = [tpl['keywords']] + if type(tpl["keywords"]) is not list: + tpl["keywords"] = [tpl["keywords"]] output.append(InvoiceTemplate(tpl)) return output diff --git a/src/invoice2data/extract/plugins/lines.py b/src/invoice2data/extract/plugins/lines.py index 34727dcf..171e893a 100644 --- a/src/invoice2data/extract/plugins/lines.py +++ b/src/invoice2data/extract/plugins/lines.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -DEFAULT_OPTIONS = {'field_separator': r'\s+', 'line_separator': r'\n'} +DEFAULT_OPTIONS = {"field_separator": r"\s+", "line_separator": r"\n"} def extract(self, content, output): @@ -17,73 +17,73 @@ def extract(self, content, output): # First apply default options. plugin_settings = DEFAULT_OPTIONS.copy() - plugin_settings.update(self['lines']) - self['lines'] = plugin_settings + plugin_settings.update(self["lines"]) + self["lines"] = plugin_settings # Validate settings - assert 'start' in self['lines'], 'Lines start regex missing' - assert 'end' in self['lines'], 'Lines end regex missing' - assert 'line' in self['lines'], 'Line regex missing' + assert "start" in self["lines"], "Lines start regex missing" + assert "end" in self["lines"], "Lines end regex missing" + assert "line" in self["lines"], "Line regex missing" - start = re.search(self['lines']['start'], content) - end = re.search(self['lines']['end'], content) + start = re.search(self["lines"]["start"], content) + end = re.search(self["lines"]["end"], content) if not start or not end: - logger.warning('no lines found - start %s, end %s', start, end) + logger.warning("no lines found - start %s, end %s", start, end) return - content = content[start.end(): end.start()] + content = content[start.end() : end.start()] lines = [] current_row = {} - if 'first_line' not in self['lines'] and 'last_line' not in self['lines']: - self['lines']['first_line'] = self['lines']['line'] - for line in re.split(self['lines']['line_separator'], content): + if "first_line" not in self["lines"] and "last_line" not in self["lines"]: + self["lines"]["first_line"] = self["lines"]["line"] + for line in re.split(self["lines"]["line_separator"], content): # if the line has empty lines in it , skip them - if not line.strip('').strip('\n') or not line: + if not line.strip("").strip("\n") or not line: continue - if 'first_line' in self['lines']: - match = re.search(self['lines']['first_line'], line) + if "first_line" in self["lines"]: + match = re.search(self["lines"]["first_line"], line) if match: - if 'last_line' not in self['lines']: + if "last_line" not in self["lines"]: if current_row: lines.append(current_row) current_row = {} if current_row: lines.append(current_row) current_row = { - field: value.strip() if value else '' + field: value.strip() if value else "" for field, value in match.groupdict().items() } continue - if 'last_line' in self['lines']: - match = re.search(self['lines']['last_line'], line) + if "last_line" in self["lines"]: + match = re.search(self["lines"]["last_line"], line) if match: for field, value in match.groupdict().items(): - current_row[field] = '%s%s%s' % ( - current_row.get(field, ''), - current_row.get(field, '') and '\n' or '', - value.strip() if value else '', + current_row[field] = "%s%s%s" % ( + current_row.get(field, ""), + current_row.get(field, "") and "\n" or "", + value.strip() if value else "", ) if current_row: lines.append(current_row) current_row = {} continue - match = re.search(self['lines']['line'], line) + match = re.search(self["lines"]["line"], line) if match: for field, value in match.groupdict().items(): - current_row[field] = '%s%s%s' % ( - current_row.get(field, ''), - current_row.get(field, '') and '\n' or '', - value.strip() if value else '', + current_row[field] = "%s%s%s" % ( + current_row.get(field, ""), + current_row.get(field, "") and "\n" or "", + value.strip() if value else "", ) continue - logger.debug('ignoring *%s* because it doesn\'t match anything', line) + logger.debug("ignoring *%s* because it doesn't match anything", line) if current_row: lines.append(current_row) - types = self['lines'].get('types', []) + types = self["lines"].get("types", []) for row in lines: for name in row.keys(): if name in types: row[name] = self.coerce_type(row[name], types[name]) if lines: - output['lines'] = lines + output["lines"] = lines diff --git a/src/invoice2data/extract/plugins/tables.py b/src/invoice2data/extract/plugins/tables.py index 1a537a6e..ea979135 100644 --- a/src/invoice2data/extract/plugins/tables.py +++ b/src/invoice2data/extract/plugins/tables.py @@ -7,13 +7,13 @@ logger = logging.getLogger(__name__) -DEFAULT_OPTIONS = {'field_separator': r'\s+', 'line_separator': r'\n'} +DEFAULT_OPTIONS = {"field_separator": r"\s+", "line_separator": r"\n"} def extract(self, content, output): """Try to extract tables from an invoice""" - for table in self['tables']: + for table in self["tables"]: # First apply default options. plugin_settings = DEFAULT_OPTIONS.copy() @@ -21,38 +21,38 @@ def extract(self, content, output): table = plugin_settings # Validate settings - assert 'start' in table, 'Table start regex missing' - assert 'end' in table, 'Table end regex missing' - assert 'body' in table, 'Table body regex missing' + assert "start" in table, "Table start regex missing" + assert "end" in table, "Table end regex missing" + assert "body" in table, "Table body regex missing" - start = re.search(table['start'], content) - end = re.search(table['end'], content) + start = re.search(table["start"], content) + end = re.search(table["end"], content) if not start or not end: - logger.warning('no table body found - start %s, end %s', start, end) + logger.warning("no table body found - start %s, end %s", start, end) continue - table_body = content[start.end(): end.start()] + table_body = content[start.end() : end.start()] - for line in re.split(table['line_separator'], table_body): + for line in re.split(table["line_separator"], table_body): # if the line has empty lines in it , skip them - if not line.strip('').strip('\n') or not line: + if not line.strip("").strip("\n") or not line: continue - match = re.search(table['body'], line) + match = re.search(table["body"], line) if match: for field, value in match.groupdict().items(): # If a field name already exists, do not overwrite it if field in output: continue - if field.startswith('date') or field.endswith('date'): + if field.startswith("date") or field.endswith("date"): output[field] = self.parse_date(value) if not output[field]: logger.error("Date parsing failed on date '%s'", value) return None - elif field.startswith('amount'): + elif field.startswith("amount"): output[field] = self.parse_number(value) else: output[field] = value - logger.debug('ignoring *%s* because it doesn\'t match anything', line) + logger.debug("ignoring *%s* because it doesn't match anything", line) diff --git a/src/invoice2data/input/gvision.py b/src/invoice2data/input/gvision.py index cfb48b49..6e7ea63c 100644 --- a/src/invoice2data/input/gvision.py +++ b/src/invoice2data/input/gvision.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -def to_text(path, bucket_name='cloud-vision-84893', language='fr'): +def to_text(path, bucket_name="cloud-vision-84893", language="fr"): """Sends PDF files to Google Cloud Vision for OCR. Before using invoice2data, make sure you have the auth json path set as @@ -26,13 +26,13 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): from google.protobuf import json_format # Supported mime_types are: 'application/pdf' and 'image/tiff' - mime_type = 'application/pdf' + mime_type = "application/pdf" path_dir, filename = os.path.split(path) - result_blob_basename = filename.replace('.pdf', '').replace('.PDF', '') - result_blob_name = result_blob_basename + '/output-1-to-1.json' - result_blob_uri = 'gs://{}/{}/'.format(bucket_name, result_blob_basename) - input_blob_uri = 'gs://{}/{}'.format(bucket_name, filename) + result_blob_basename = filename.replace(".pdf", "").replace(".PDF", "") + result_blob_name = result_blob_basename + "/output-1-to-1.json" + result_blob_uri = "gs://{}/{}/".format(bucket_name, result_blob_basename) + input_blob_uri = "gs://{}/{}".format(bucket_name, filename) # Upload file to gcloud if it doesn't exist yet storage_client = storage.Client() @@ -51,10 +51,14 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): client = vision.ImageAnnotatorClient() - feature = vision.types.Feature(type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION) + feature = vision.types.Feature( + type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION + ) gcs_source = vision.types.GcsSource(uri=input_blob_uri) - input_config = vision.types.InputConfig(gcs_source=gcs_source, mime_type=mime_type) + input_config = vision.types.InputConfig( + gcs_source=gcs_source, mime_type=mime_type + ) gcs_destination = vision.types.GcsDestination(uri=result_blob_uri) output_config = vision.types.OutputConfig( @@ -67,7 +71,7 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): operation = client.async_batch_annotate_files(requests=[async_request]) - print('Waiting for the operation to finish.') + print("Waiting for the operation to finish.") operation.result(timeout=180) # Get result after OCR is completed @@ -80,4 +84,4 @@ def to_text(path, bucket_name='cloud-vision-84893', language='fr'): first_page_response = response.responses[0] annotation = first_page_response.full_text_annotation - return annotation.text.encode('utf-8') + return annotation.text.encode("utf-8") diff --git a/src/invoice2data/input/pdfminer_wrapper.py b/src/invoice2data/input/pdfminer_wrapper.py index 6852ecd0..c0c14b71 100644 --- a/src/invoice2data/input/pdfminer_wrapper.py +++ b/src/invoice2data/input/pdfminer_wrapper.py @@ -20,7 +20,7 @@ def to_text(path): import sys reload(sys) # noqa: F821 - sys.setdefaultencoding('utf8') + sys.setdefaultencoding("utf8") except ImportError: from io import StringIO @@ -31,11 +31,11 @@ def to_text(path): rsrcmgr = PDFResourceManager() retstr = StringIO() - codec = 'utf-8' + codec = "utf-8" laparams = LAParams() laparams.all_texts = True device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) - with open(path, 'rb') as fp: + with open(path, "rb") as fp: interpreter = PDFPageInterpreter(rsrcmgr, device) password = "" maxpages = 0 @@ -54,4 +54,4 @@ def to_text(path): device.close() str = retstr.getvalue() retstr.close() - return str.encode('utf-8') + return str.encode("utf-8") diff --git a/src/invoice2data/input/pdftotext.py b/src/invoice2data/input/pdftotext.py index 6afb3039..4ba37992 100644 --- a/src/invoice2data/input/pdftotext.py +++ b/src/invoice2data/input/pdftotext.py @@ -22,10 +22,10 @@ def to_text(path): if spawn.find_executable("pdftotext"): # shutil.which('pdftotext'): out, err = subprocess.Popen( - ["pdftotext", '-layout', '-enc', 'UTF-8', path, '-'], stdout=subprocess.PIPE + ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"], stdout=subprocess.PIPE ).communicate() return out else: raise EnvironmentError( - 'pdftotext not installed. Can be downloaded from https://poppler.freedesktop.org/' + "pdftotext not installed. Can be downloaded from https://poppler.freedesktop.org/" ) diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py index ab2bd7ea..0a7a3c31 100644 --- a/src/invoice2data/input/tesseract.py +++ b/src/invoice2data/input/tesseract.py @@ -19,16 +19,26 @@ def to_text(path): from distutils import spawn # Check for dependencies. Needs Tesseract and Imagemagick installed. - if not spawn.find_executable('tesseract'): - raise EnvironmentError('tesseract not installed.') - if not spawn.find_executable('convert'): - raise EnvironmentError('imagemagick not installed.') + if not spawn.find_executable("tesseract"): + raise EnvironmentError("tesseract not installed.") + if not spawn.find_executable("convert"): + raise EnvironmentError("imagemagick not installed.") # convert = "convert -density 350 %s -depth 8 tiff:-" % (path) - convert = ['convert', '-density', '350', path, '-depth', '8', '-alpha', 'off', 'png:-'] + convert = [ + "convert", + "-density", + "350", + path, + "-depth", + "8", + "-alpha", + "off", + "png:-", + ] p1 = subprocess.Popen(convert, stdout=subprocess.PIPE) - tess = ['tesseract', 'stdin', 'stdout'] + tess = ["tesseract", "stdin", "stdout"] p2 = subprocess.Popen(tess, stdin=p1.stdout, stdout=subprocess.PIPE) out, err = p2.communicate() diff --git a/src/invoice2data/input/tesseract4.py b/src/invoice2data/input/tesseract4.py index b406a585..83b41570 100644 --- a/src/invoice2data/input/tesseract4.py +++ b/src/invoice2data/input/tesseract4.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -def to_text(path, language='fra'): +def to_text(path, language="fra"): """Wraps Tesseract 4 OCR with custom language model. Parameters @@ -19,47 +19,57 @@ def to_text(path, language='fra'): import time # Check for dependencies. Needs Tesseract and Imagemagick installed. - if not spawn.find_executable('tesseract'): - raise EnvironmentError('tesseract not installed.') - if not spawn.find_executable('convert'): - raise EnvironmentError('imagemagick not installed.') - if not spawn.find_executable('gs'): - raise EnvironmentError('ghostscript not installed.') + if not spawn.find_executable("tesseract"): + raise EnvironmentError("tesseract not installed.") + if not spawn.find_executable("convert"): + raise EnvironmentError("imagemagick not installed.") + if not spawn.find_executable("gs"): + raise EnvironmentError("ghostscript not installed.") - with tempfile.NamedTemporaryFile(suffix='.tiff') as tf: + with tempfile.NamedTemporaryFile(suffix=".tiff") as tf: # Step 1: Convert to TIFF gs_cmd = [ - 'gs', - '-q', - '-dNOPAUSE', - '-r600x600', - '-sDEVICE=tiff24nc', - '-sOutputFile=' + tf.name, + "gs", + "-q", + "-dNOPAUSE", + "-r600x600", + "-sDEVICE=tiff24nc", + "-sOutputFile=" + tf.name, path, - '-c', - 'quit', + "-c", + "quit", ] subprocess.Popen(gs_cmd) time.sleep(3) # Step 2: Enhance TIFF magick_cmd = [ - 'convert', + "convert", tf.name, - '-colorspace', - 'gray', - '-type', - 'grayscale', - '-contrast-stretch', - '0', - '-sharpen', - '0x1', - 'tiff:-', + "-colorspace", + "gray", + "-type", + "grayscale", + "-contrast-stretch", + "0", + "-sharpen", + "0x1", + "tiff:-", ] p1 = subprocess.Popen(magick_cmd, stdout=subprocess.PIPE) - tess_cmd = ['tesseract', '-l', language, '--oem', '1', '--psm', '3', 'stdin', 'stdout'] + tess_cmd = [ + "tesseract", + "-l", + language, + "--oem", + "1", + "--psm", + "3", + "stdin", + "stdout", + ] p2 = subprocess.Popen(tess_cmd, stdin=p1.stdout, stdout=subprocess.PIPE) out, err = p2.communicate() diff --git a/src/invoice2data/main.py b/src/invoice2data/main.py index a06063e0..d4b1074d 100644 --- a/src/invoice2data/main.py +++ b/src/invoice2data/main.py @@ -23,14 +23,14 @@ logger = logging.getLogger(__name__) input_mapping = { - 'pdftotext': pdftotext, - 'tesseract': tesseract, - 'tesseract4': tesseract4, - 'pdfminer': pdfminer_wrapper, - 'gvision': gvision, + "pdftotext": pdftotext, + "tesseract": tesseract, + "tesseract4": tesseract4, + "pdfminer": pdfminer_wrapper, + "gvision": gvision, } -output_mapping = {'csv': to_csv, 'json': to_json, 'xml': to_xml, 'none': None} +output_mapping = {"csv": to_csv, "json": to_json, "xml": to_xml, "none": None} def extract_data(invoicefile, templates=None, input_module=pdftotext): @@ -79,20 +79,20 @@ def extract_data(invoicefile, templates=None, input_module=pdftotext): templates = read_templates() # print(templates[0]) - extracted_str = input_module.to_text(invoicefile).decode('utf-8') + extracted_str = input_module.to_text(invoicefile).decode("utf-8") - logger.debug('START pdftotext result ===========================') + logger.debug("START pdftotext result ===========================") logger.debug(extracted_str) - logger.debug('END pdftotext result =============================') + logger.debug("END pdftotext result =============================") - logger.debug('Testing {} template files'.format(len(templates))) + logger.debug("Testing {} template files".format(len(templates))) for t in templates: optimized_str = t.prepare_input(extracted_str) if t.matches_input(optimized_str): return t.extract(optimized_str) - logger.error('No template for %s', invoicefile) + logger.error("No template for %s", invoicefile) return False @@ -100,75 +100,84 @@ def create_parser(): """Returns argument parser """ parser = argparse.ArgumentParser( - description='Extract structured data from PDF files and save to CSV or JSON.' + description="Extract structured data from PDF files and save to CSV or JSON." ) parser.add_argument( - '--input-reader', + "--input-reader", choices=input_mapping.keys(), - default='pdftotext', - help='Choose text extraction function. Default: pdftotext', + default="pdftotext", + help="Choose text extraction function. Default: pdftotext", ) parser.add_argument( - '--output-format', + "--output-format", choices=output_mapping.keys(), - default='none', - help='Choose output format. Default: none', + default="none", + help="Choose output format. Default: none", ) parser.add_argument( - '--output-date-format', - dest='output_date_format', + "--output-date-format", + dest="output_date_format", default="%Y-%m-%d", - help='Choose output date format. Default: %%Y-%%m-%%d (ISO 8601 Date)', + help="Choose output date format. Default: %%Y-%%m-%%d (ISO 8601 Date)", ) parser.add_argument( - '--output-name', - '-o', - dest='output_name', - default='invoices-output', - help='Custom name for output file. Extension is added based on chosen format.', + "--output-name", + "-o", + dest="output_name", + default="invoices-output", + help="Custom name for output file. Extension is added based on chosen format.", ) parser.add_argument( - '--debug', dest='debug', action='store_true', help='Enable debug information.' + "--debug", dest="debug", action="store_true", help="Enable debug information." ) parser.add_argument( - '--copy', '-c', dest='copy', help='Copy and rename processed PDFs to specified folder.' + "--copy", + "-c", + dest="copy", + help="Copy and rename processed PDFs to specified folder.", ) parser.add_argument( - '--move', '-m', dest='move', help='Move and rename processed PDFs to specified folder.' + "--move", + "-m", + dest="move", + help="Move and rename processed PDFs to specified folder.", ) parser.add_argument( - '--filename-format', - dest='filename', + "--filename-format", + dest="filename", default="{date} {invoice_number} {desc}.pdf", - help='Filename format to use when moving or copying processed PDFs.' - 'Default: "{date} {invoice_number} {desc}.pdf"', + help="Filename format to use when moving or copying processed PDFs." + 'Default: "{date} {invoice_number} {desc}.pdf"', ) parser.add_argument( - '--template-folder', - '-t', - dest='template_folder', - help='Folder containing invoice templates in yml file. Always adds built-in templates.', + "--template-folder", + "-t", + dest="template_folder", + help="Folder containing invoice templates in yml file. Always adds built-in templates.", ) parser.add_argument( - '--exclude-built-in-templates', - dest='exclude_built_in_templates', + "--exclude-built-in-templates", + dest="exclude_built_in_templates", default=False, - help='Ignore built-in templates.', + help="Ignore built-in templates.", action="store_true", ) parser.add_argument( - 'input_files', type=argparse.FileType('r'), nargs='+', help='File or directory to analyze.' + "input_files", + type=argparse.FileType("r"), + nargs="+", + help="File or directory to analyze.", ) return parser @@ -204,16 +213,16 @@ def main(args=None): output.append(res) if args.copy: filename = args.filename.format( - date=res['date'].strftime('%Y-%m-%d'), - invoice_number=res['invoice_number'], - desc=res['desc'], + date=res["date"].strftime("%Y-%m-%d"), + invoice_number=res["invoice_number"], + desc=res["desc"], ) shutil.copyfile(f.name, join(args.copy, filename)) if args.move: filename = args.filename.format( - date=res['date'].strftime('%Y-%m-%d'), - invoice_number=res['invoice_number'], - desc=res['desc'], + date=res["date"].strftime("%Y-%m-%d"), + invoice_number=res["invoice_number"], + desc=res["desc"], ) shutil.move(f.name, join(args.move, filename)) f.close() @@ -222,5 +231,5 @@ def main(args=None): output_module.write_to_file(output, args.output_name, args.output_date_format) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/invoice2data/output/to_csv.py b/src/invoice2data/output/to_csv.py index df37e1ec..68d13d21 100644 --- a/src/invoice2data/output/to_csv.py +++ b/src/invoice2data/output/to_csv.py @@ -27,18 +27,18 @@ def write_to_file(data, path, date_format="%Y-%m-%d"): >>> to_csv.write_to_file(data, "invoice.csv") """ - if path.endswith('.csv'): + if path.endswith(".csv"): filename = path else: - filename = path + '.csv' + filename = path + ".csv" if sys.version_info[0] < 3: openfile = open(filename, "wb") else: - openfile = open(filename, "w", newline='') + openfile = open(filename, "w", newline="") with openfile as csv_file: - writer = csv.writer(csv_file, delimiter=',') + writer = csv.writer(csv_file, delimiter=",") for line in data: first_row = [] @@ -50,7 +50,7 @@ def write_to_file(data, path, date_format="%Y-%m-%d"): csv_items = [] for k, v in line.items(): # first_row.append(k) - if k.startswith('date') or k.endswith('date'): + if k.startswith("date") or k.endswith("date"): v = v.strftime(date_format) csv_items.append(v) writer.writerow(csv_items) diff --git a/src/invoice2data/output/to_json.py b/src/invoice2data/output/to_json.py index 1e184d91..4c15aef5 100644 --- a/src/invoice2data/output/to_json.py +++ b/src/invoice2data/output/to_json.py @@ -34,18 +34,23 @@ def write_to_file(data, path, date_format="%Y-%m-%d"): >>> to_json.write_to_file(data, "invoice.json") """ - if path.endswith('.json'): + if path.endswith(".json"): filename = path else: - filename = path + '.json' + filename = path + ".json" - with codecs.open(filename, "w", encoding='utf-8') as json_file: + with codecs.open(filename, "w", encoding="utf-8") as json_file: for line in data: for k, v in line.items(): - if k.startswith('date') or k.endswith('date'): + if k.startswith("date") or k.endswith("date"): line[k] = v.strftime(date_format) print(type(json)) print(json) json.dump( - data, json_file, indent=4, sort_keys=True, default=myconverter, ensure_ascii=False + data, + json_file, + indent=4, + sort_keys=True, + default=myconverter, + ensure_ascii=False, ) diff --git a/src/invoice2data/output/to_xml.py b/src/invoice2data/output/to_xml.py index 06dd00e2..30e4ebc4 100644 --- a/src/invoice2data/output/to_xml.py +++ b/src/invoice2data/output/to_xml.py @@ -4,12 +4,12 @@ def prettify(elem): """Return a pretty-printed XML string for the Element.""" - rough_string = ET.tostring(elem, 'utf-8') + rough_string = ET.tostring(elem, "utf-8") reparsed = minidom.parseString(rough_string) return reparsed.toprettyxml(indent=" ") -def write_to_file(data, path, date_format='%Y-%m-%d'): +def write_to_file(data, path, date_format="%Y-%m-%d"): """Export extracted fields to xml Appends .xml to path if missing and generates xml file in specified directory, if not then in root @@ -36,26 +36,26 @@ def write_to_file(data, path, date_format='%Y-%m-%d'): """ - if path.endswith('.xml'): + if path.endswith(".xml"): filename = path else: - filename = path + '.xml' + filename = path + ".xml" - tag_data = ET.Element('data') + tag_data = ET.Element("data") xml_file = open(filename, "w") i = 0 for line in data: i += 1 - tag_item = ET.SubElement(tag_data, 'item') - tag_date = ET.SubElement(tag_item, 'date') - tag_desc = ET.SubElement(tag_item, 'desc') - tag_currency = ET.SubElement(tag_item, 'currency') - tag_amount = ET.SubElement(tag_item, 'amount') - tag_item.set('id', str(i)) - tag_date.text = line['date'].strftime(date_format) - tag_desc.text = line['desc'] - tag_currency.text = line['currency'] - tag_amount.text = str(line['amount']) + tag_item = ET.SubElement(tag_data, "item") + tag_date = ET.SubElement(tag_item, "date") + tag_desc = ET.SubElement(tag_item, "desc") + tag_currency = ET.SubElement(tag_item, "currency") + tag_amount = ET.SubElement(tag_item, "amount") + tag_item.set("id", str(i)) + tag_date.text = line["date"].strftime(date_format) + tag_desc.text = line["desc"] + tag_currency.text = line["currency"] + tag_amount.text = str(line["amount"]) xml_file.write(prettify(tag_data)) xml_file.close()