diff --git a/pdf2bib/bibtex_makers.py b/pdf2bib/bibtex_makers.py index 995c4c2..fe7eb7a 100644 --- a/pdf2bib/bibtex_makers.py +++ b/pdf2bib/bibtex_makers.py @@ -178,7 +178,9 @@ def make_bibtex(metadata): id = id.lower() id = remove_latex_codes(id) id = unidecode(id) #This makes sure that the id of the bibtex entry is only made out of ascii characters (i.e. no accents, tildes, etc.) - id = re.sub('-|,', '', id) #Make sure to remove any possible hyphen and comma + id = re.sub( + "-|,|:|'|\\n", "", id + ) # Make sure to remove any possible hyphen, comma, colon, single quote, and newline. if id == '': id = 'NoValidID' @@ -215,4 +217,4 @@ def remove_latex_codes(text): #This regex looks for any substring that matches the pattern "{\string1{string2}}" where string1 can be anything, #and it replaces the whole substring by string2 text_sanitized = re.sub(r"{\\[^\{]+{([\w]+)}}", r"\1",text) - return text_sanitized \ No newline at end of file + return text_sanitized diff --git a/pdf2bib/main.py b/pdf2bib/main.py index 2753047..79e507a 100644 --- a/pdf2bib/main.py +++ b/pdf2bib/main.py @@ -46,6 +46,9 @@ def pdf2bib(target): # Setup logging logger = logging.getLogger("pdf2bib") + # Make sure the path is a string in case a Pathlib object is provided + target = str(target) + #Check if path is valid if not(path.exists(target)): logger.error(f"{target} is not a valid path to a file or a directory.") @@ -135,9 +138,18 @@ def pdf2bib_singlefile(filename): logger.info(f"pdf2doi found a valid identifier for this paper.") - if result['identifier_type'] in ['arxiv ID','arxiv DOI']: + if result["identifier_type"] == "arxiv ID": + logger.info(f"Parsing the info returned by export.arxiv.org...") + metadata = bibtex_makers.parse_bib_from_exportarxivorg( + result["validation_info"] + ) + elif result["identifier_type"] == "arxiv DOI": + if "arxiv_doi" not in result["validation_info"]: + result["validation_info"]["arxiv_doi"] = result["identifier"] logger.info(f"Parsing the info returned by export.arxiv.org...") - metadata = bibtex_makers.parse_bib_from_exportarxivorg(result['validation_info']) + metadata = bibtex_makers.parse_bib_from_exportarxivorg( + result["validation_info"] + ) elif result['identifier_type'] == 'DOI': logger.info(f"Parsing the info returned by dx.doi.org...") metadata = bibtex_makers.parse_bib_from_dxdoiorg(result['validation_info'], method=pdf2doi.config.get('method_dxdoiorg')) @@ -303,4 +315,4 @@ def main(): return if __name__ == '__main__': - main() \ No newline at end of file + main()