From 3ad96e4ee2fd980889f34027daf0ccc67dc1a308 Mon Sep 17 00:00:00 2001 From: LIM0000 Date: Wed, 15 Jun 2022 20:10:20 +0930 Subject: [PATCH 1/4] Fix 8876 by reworking ICAR fetcher --- .../importer/fetcher/IacrEprintFetcher.java | 92 +++---------------- 1 file changed, 15 insertions(+), 77 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java index 6eae4cf35c2..6c65a25513f 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java @@ -1,17 +1,10 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; -import java.time.DateTimeException; import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.time.temporal.TemporalAccessor; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; import java.util.Locale; import java.util.Optional; import java.util.function.Predicate; -import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jabref.logic.importer.FetcherException; @@ -43,8 +36,9 @@ public class IacrEprintFetcher implements IdBasedFetcher { private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS = DateTimeFormatter.ofPattern("MMM d['th']['st']['nd']['rd'] yyyy", Locale.US); private static final DateTimeFormatter DATE_FORMAT_BIBTEX = DateTimeFormatter.ISO_LOCAL_DATE; private static final Predicate IDENTIFIER_PREDICATE = Pattern.compile("\\d{4}/\\d{3,5}").asPredicate(); - private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/eprint-bin/cite.pl?entry="; + private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/"; private static final String DESCRIPTION_URL_PREFIX = "https://eprint.iacr.org/"; + private static final String VERSION_URL_PREFIX = "https://eprint.iacr.org/archive/versions/"; private final ImportFormatPreferences prefs; @@ -74,7 +68,7 @@ private Optional createEntryFromIacrCitation(String validIdentifier) t if (bibtexCitationHtml.contains("No such report found")) { throw new FetcherException(Localization.lang("No results found.")); } - String actualEntry = getRequiredValueBetween("
", "
", bibtexCitationHtml); + String actualEntry = getRequiredValueBetween("
", "
", bibtexCitationHtml); try { return BibtexParser.singleFromString(actualEntry, prefs, new DummyFileUpdateMonitor()); @@ -86,88 +80,32 @@ private Optional createEntryFromIacrCitation(String validIdentifier) t private void setAdditionalFields(BibEntry entry, String identifier) throws FetcherException { String entryUrl = DESCRIPTION_URL_PREFIX + identifier; String descriptiveHtml = getHtml(entryUrl); + entry.setField(StandardField.ABSTRACT, getAbstract(descriptiveHtml)); - String dateStringAsInHtml = getRequiredValueBetween("Date: ", "

", descriptiveHtml); - entry.setField(StandardField.DATE, getLatestDate(dateStringAsInHtml)); + String dateStringAsInHtml = getRequiredValueBetween("

History
" + "\n \n \n " + "
", ":", descriptiveHtml); + entry.setField(StandardField.DATE, dateStringAsInHtml); + // Version information for entries after year 2000 if (isFromOrAfterYear2000(entry)) { - String version = getVersion(identifier, descriptiveHtml); + String entryVersion = VERSION_URL_PREFIX + identifier; + String versionHtml = getHtml(entryVersion); + String version = getVersion(identifier, versionHtml); entry.setField(StandardField.VERSION, version); entry.setField(StandardField.URL, entryUrl + "/" + version); - } else { - // No version information for entries before year 2000 - entry.setField(StandardField.URL, entryUrl); } } - private String getVersion(String identifier, String descriptiveHtml) throws FetcherException { - String startOfVersionString = "Version: ", versionHtml); return version; } private String getAbstract(String descriptiveHtml) throws FetcherException { - String abstractText = getRequiredValueBetween("Abstract: ", "

", descriptiveHtml); - // for some reason, all spaces are doubled... - abstractText = abstractText.replaceAll("\\s(\\s)", "$1"); + String abstractText = getRequiredValueBetween("

Abstract
" + "\n " + "

", "

", descriptiveHtml); return abstractText; } - - private String getLatestDate(String dateStringAsInHtml) throws FetcherException { - if (dateStringAsInHtml.contains("withdrawn")) { - throw new FetcherException(Localization.lang("This paper has been withdrawn.")); - } - String[] rawDates = dateStringAsInHtml.split(", \\D"); - List formattedDates = new ArrayList<>(); - for (String rawDate : rawDates) { - TemporalAccessor date = parseSingleDateFromWebsite(rawDate); - if (date != null) { - formattedDates.add(DATE_FORMAT_BIBTEX.format(date)); - } - } - - if (formattedDates.isEmpty()) { - throw new FetcherException(Localization.lang("Entry from %0 could not be parsed.", "IACR")); - } - - Collections.sort(formattedDates, Collections.reverseOrder()); - return formattedDates.get(0); - } - - private TemporalAccessor parseSingleDateFromWebsite(String dateStringFromWebsite) { - TemporalAccessor date = null; - // Some entries contain double spaces in the date string (which would break our regexs below) - String dateStringWithoutDoubleSpaces = dateStringFromWebsite.replaceAll("\\s\\s+", " "); - - Matcher dateMatcherAfter2000 = DATE_FROM_WEBSITE_AFTER_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim()); - if (dateMatcherAfter2000.find()) { - try { - date = DATE_FORMAT_WEBSITE_AFTER_2000.parse(dateMatcherAfter2000.group(1)); - } catch (DateTimeParseException e) { - LOGGER.warn("Date from IACR could not be parsed", e); - } - } - - // Entries before year 2000 use a variety of date formats - fortunately, we can match them with only two different - // date formats (each of which differ from the unified format of post-2000 entries). - Matcher dateMatcherBefore2000 = DATE_FROM_WEBSITE_BEFORE_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim()); - if (dateMatcherBefore2000.find()) { - String dateWithoutComma = dateMatcherBefore2000.group(1).replace(",", ""); - try { - date = DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS.parse(dateWithoutComma); - } catch (DateTimeParseException e) { - try { - date = DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS.parse(dateWithoutComma); - } catch (DateTimeException e1) { - LOGGER.warn("Date from IACR could not be parsed", e); - LOGGER.warn("Date from IACR could not be parsed", e1); - } - } - } - - return date; - } - + private String getHtml(String url) throws FetcherException { try { URLDownload download = new URLDownload(url); From 0c58fac794683ea8d449e5725decff7cb92725eb Mon Sep 17 00:00:00 2001 From: LIM0000 Date: Wed, 15 Jun 2022 23:15:39 +0930 Subject: [PATCH 2/4] Update LocalizationKeys --- src/main/resources/l10n/JabRef_en.properties | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 1cc2c330268..368b722e913 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -1868,7 +1868,6 @@ Removes\ all\ hyphenated\ line\ breaks\ in\ the\ field\ content.=Removes all hyp Could\ not\ retrieve\ entry\ data\ from\ '%0'.=Could not retrieve entry data from '%0'. Entry\ from\ %0\ could\ not\ be\ parsed.=Entry from %0 could not be parsed. Invalid\ identifier\:\ '%0'.=Invalid identifier: '%0'. -This\ paper\ has\ been\ withdrawn.=This paper has been withdrawn. empty\ citation\ key=empty citation key Aux\ file=Aux file Group\ containing\ entries\ cited\ in\ a\ given\ TeX\ file=Group containing entries cited in a given TeX file From 80af9b5ab683617eb042ea2b0d0b0ded253fc276 Mon Sep 17 00:00:00 2001 From: LIM0000 Date: Sat, 18 Jun 2022 12:19:43 +0930 Subject: [PATCH 3/4] Clean up unnecessary variables and create getDate() function --- .../importer/fetcher/IacrEprintFetcher.java | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java index 6c65a25513f..42777780515 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java @@ -1,8 +1,6 @@ package org.jabref.logic.importer.fetcher; import java.io.IOException; -import java.time.format.DateTimeFormatter; -import java.util.Locale; import java.util.Optional; import java.util.function.Predicate; import java.util.regex.Pattern; @@ -19,22 +17,12 @@ import org.jabref.model.strings.StringUtil; import org.jabref.model.util.DummyFileUpdateMonitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - public class IacrEprintFetcher implements IdBasedFetcher { public static final String NAME = "IACR eprints"; - private static final Logger LOGGER = LoggerFactory.getLogger(IacrEprintFetcher.class); - private static final Pattern DATE_FROM_WEBSITE_AFTER_2000_PATTERN = Pattern.compile("[a-z ]+(\\d{1,2} [A-Za-z][a-z]{2} \\d{4})"); - private static final Pattern DATE_FROM_WEBSITE_BEFORE_2000_PATTERN = Pattern.compile("[A-Za-z ]+? ([A-Za-z][a-z]{2,10} \\d{1,2}(th|st|nd|rd)?, \\d{4})\\.?"); private static final Pattern WITHOUT_LETTERS_SPACE = Pattern.compile("[^0-9/]"); - private static final DateTimeFormatter DATE_FORMAT_WEBSITE_AFTER_2000 = DateTimeFormatter.ofPattern("d MMM yyyy", Locale.US); - private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS = DateTimeFormatter.ofPattern("MMMM d['th']['st']['nd']['rd'] yyyy", Locale.US); - private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS = DateTimeFormatter.ofPattern("MMM d['th']['st']['nd']['rd'] yyyy", Locale.US); - private static final DateTimeFormatter DATE_FORMAT_BIBTEX = DateTimeFormatter.ISO_LOCAL_DATE; private static final Predicate IDENTIFIER_PREDICATE = Pattern.compile("\\d{4}/\\d{3,5}").asPredicate(); private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/"; private static final String DESCRIPTION_URL_PREFIX = "https://eprint.iacr.org/"; @@ -82,8 +70,7 @@ private void setAdditionalFields(BibEntry entry, String identifier) throws Fetch String descriptiveHtml = getHtml(entryUrl); entry.setField(StandardField.ABSTRACT, getAbstract(descriptiveHtml)); - String dateStringAsInHtml = getRequiredValueBetween("
History
" + "\n \n \n " + "
", ":", descriptiveHtml); - entry.setField(StandardField.DATE, dateStringAsInHtml); + entry.setField(StandardField.DATE, getDate(descriptiveHtml)); // Version information for entries after year 2000 if (isFromOrAfterYear2000(entry)) { @@ -102,10 +89,17 @@ private String getVersion(String identifier, String versionHtml) throws FetcherE } private String getAbstract(String descriptiveHtml) throws FetcherException { - String abstractText = getRequiredValueBetween("
Abstract
" + "\n " + "

", "

", descriptiveHtml); + String startOfAbstractString = "
Abstract
\n

"; + String abstractText = getRequiredValueBetween(startOfAbstractString, "

", descriptiveHtml); return abstractText; } - + + private String getDate(String descriptiveHtml) throws FetcherException { + String startOfHistoryString = "
History
\n \n \n
"; + String dateStringAsInHtml = getRequiredValueBetween(startOfHistoryString, ":", descriptiveHtml); + return dateStringAsInHtml; + } + private String getHtml(String url) throws FetcherException { try { URLDownload download = new URLDownload(url); From 0d09116bbebef3a77f71452cbca05b9301066563 Mon Sep 17 00:00:00 2001 From: LIM0000 Date: Mon, 20 Jun 2022 06:22:06 +0930 Subject: [PATCH 4/4] Update Iacr fetcher tests --- .../fetcher/IacrEprintFetcherTest.java | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/test/java/org/jabref/logic/importer/fetcher/IacrEprintFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/IacrEprintFetcherTest.java index 43ca28eaaa7..bf5bf9eef3a 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/IacrEprintFetcherTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/IacrEprintFetcherTest.java @@ -44,36 +44,37 @@ public void setUp() { fetcher = new IacrEprintFetcher(mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS)); abram2017 = new BibEntry(StandardEntryType.Misc) - .withCitationKey("cryptoeprint:2017:1118") + .withCitationKey("cryptoeprint:2017/1118") .withField(StandardField.ABSTRACT, "dummy") .withField(StandardField.AUTHOR, "Ittai Abraham and Dahlia Malkhi and Kartik Nayak and Ling Ren and Alexander Spiegelman") - .withField(StandardField.DATE, "2017-11-18") - .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1118") - .withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1118}") + .withField(StandardField.DATE, "2017-11-24") + .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1118") + .withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1118}") .withField(StandardField.TITLE, "Solida: A Blockchain Protocol Based on Reconfigurable Byzantine Consensus") .withField(StandardField.URL, "https://eprint.iacr.org/2017/1118/20171124:064527") .withField(StandardField.VERSION, "20171124:064527") .withField(StandardField.YEAR, "2017"); beierle2016 = new BibEntry(StandardEntryType.Misc) - .withCitationKey("cryptoeprint:2016:119") + .withCitationKey("cryptoeprint:2016/119") .withField(StandardField.ABSTRACT, "dummy") .withField(StandardField.AUTHOR, "Christof Beierle and Thorsten Kranz and Gregor Leander") .withField(StandardField.DATE, "2017-02-17") - .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2016/119") - .withField(StandardField.NOTE, "\\url{https://ia.cr/2016/119}") + .withField(StandardField.DOI, "10.1007/978-3-662-53018-4_23") + .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2016/119") + .withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2016/119}") .withField(StandardField.TITLE, "Lightweight Multiplication in GF(2^n) with Applications to MDS Matrices") .withField(StandardField.URL, "https://eprint.iacr.org/2016/119/20170217:150415") .withField(StandardField.VERSION, "20170217:150415") .withField(StandardField.YEAR, "2016"); delgado2017 = new BibEntry(StandardEntryType.Misc) - .withCitationKey("cryptoeprint:2017:1095") + .withCitationKey("cryptoeprint:2017/1095") .withField(StandardField.ABSTRACT, "dummy") .withField(StandardField.AUTHOR, "Sergi Delgado-Segura and Cristina Pérez-Solà and Guillermo Navarro-Arribas and Jordi Herrera-Joancomartí") .withField(StandardField.DATE, "2018-01-19") - .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1095") - .withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1095}") + .withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1095") + .withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1095}") .withField(StandardField.TITLE, "Analysis of the Bitcoin UTXO set") .withField(StandardField.URL, "https://eprint.iacr.org/2017/1095/20180119:113352") .withField(StandardField.VERSION, "20180119:113352")