Skip to content

Commit

Permalink
Rework IACR fetcher (#8904)
Browse files Browse the repository at this point in the history
  • Loading branch information
LIM0000 authored Jun 20, 2022
1 parent 3475ec9 commit 41edd28
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 97 deletions.
104 changes: 18 additions & 86 deletions src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,8 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.time.DateTimeException;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.TemporalAccessor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.importer.FetcherException;
Expand All @@ -26,25 +17,16 @@
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IacrEprintFetcher implements IdBasedFetcher {

public static final String NAME = "IACR eprints";

private static final Logger LOGGER = LoggerFactory.getLogger(IacrEprintFetcher.class);
private static final Pattern DATE_FROM_WEBSITE_AFTER_2000_PATTERN = Pattern.compile("[a-z ]+(\\d{1,2} [A-Za-z][a-z]{2} \\d{4})");
private static final Pattern DATE_FROM_WEBSITE_BEFORE_2000_PATTERN = Pattern.compile("[A-Za-z ]+? ([A-Za-z][a-z]{2,10} \\d{1,2}(th|st|nd|rd)?, \\d{4})\\.?");
private static final Pattern WITHOUT_LETTERS_SPACE = Pattern.compile("[^0-9/]");

private static final DateTimeFormatter DATE_FORMAT_WEBSITE_AFTER_2000 = DateTimeFormatter.ofPattern("d MMM yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS = DateTimeFormatter.ofPattern("MMMM d['th']['st']['nd']['rd'] yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS = DateTimeFormatter.ofPattern("MMM d['th']['st']['nd']['rd'] yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_BIBTEX = DateTimeFormatter.ISO_LOCAL_DATE;
private static final Predicate<String> IDENTIFIER_PREDICATE = Pattern.compile("\\d{4}/\\d{3,5}").asPredicate();
private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/eprint-bin/cite.pl?entry=";
private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/";
private static final String DESCRIPTION_URL_PREFIX = "https://eprint.iacr.org/";
private static final String VERSION_URL_PREFIX = "https://eprint.iacr.org/archive/versions/";

private final ImportFormatPreferences prefs;

Expand Down Expand Up @@ -74,7 +56,7 @@ private Optional<BibEntry> createEntryFromIacrCitation(String validIdentifier) t
if (bibtexCitationHtml.contains("No such report found")) {
throw new FetcherException(Localization.lang("No results found."));
}
String actualEntry = getRequiredValueBetween("<pre>", "</pre>", bibtexCitationHtml);
String actualEntry = getRequiredValueBetween("<pre id=\"bibtex\">", "</pre>", bibtexCitationHtml);

try {
return BibtexParser.singleFromString(actualEntry, prefs, new DummyFileUpdateMonitor());
Expand All @@ -86,86 +68,36 @@ private Optional<BibEntry> createEntryFromIacrCitation(String validIdentifier) t
/**
 * Fills in fields on {@code entry} using the HTML page found at
 * {@code DESCRIPTION_URL_PREFIX + identifier}.
 *
 * The ABSTRACT and DATE fields are always set. When {@code isFromOrAfterYear2000(entry)} holds, the
 * VERSION field is set too and the URL field becomes the description URL followed by "/" and the version;
 * otherwise the URL field is the bare description URL.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so superseded and replacement
 * statements appear back to back (DATE is assigned twice, {@code version} is declared twice). It is not
 * compilable as shown; which statement of each pair survives should be confirmed against the repository.
 *
 * @param entry      the entry whose fields are set
 * @param identifier the identifier appended to the description and version URL prefixes
 * @throws FetcherException declared here; presumably propagated from the fetch/extract helpers, whose
 *                          bodies are not visible in this listing
 */
private void setAdditionalFields(BibEntry entry, String identifier) throws FetcherException {
String entryUrl = DESCRIPTION_URL_PREFIX + identifier;
String descriptiveHtml = getHtml(entryUrl);

entry.setField(StandardField.ABSTRACT, getAbstract(descriptiveHtml));
// Variant 1 (appears to be the pre-change code): cut the raw date text out of the page and let
// getLatestDate pick a date from it.
String dateStringAsInHtml = getRequiredValueBetween("<b>Date: </b>", "<p />", descriptiveHtml);
entry.setField(StandardField.DATE, getLatestDate(dateStringAsInHtml));
// Variant 2 (appears to be the replacement): getDate extracts the date directly from the page.
entry.setField(StandardField.DATE, getDate(descriptiveHtml));

// Version information for entries after year 2000
if (isFromOrAfterYear2000(entry)) {
// Variant 1 (appears to be the pre-change code): version taken from the description page itself.
String version = getVersion(identifier, descriptiveHtml);
// Variant 2 (appears to be the replacement): a second page, VERSION_URL_PREFIX + identifier, is
// downloaded and the version is taken from that page instead.
String entryVersion = VERSION_URL_PREFIX + identifier;
String versionHtml = getHtml(entryVersion);
String version = getVersion(identifier, versionHtml);
entry.setField(StandardField.VERSION, version);
// The stored URL points at the specific version: <description URL>/<version>.
entry.setField(StandardField.URL, entryUrl + "/" + version);
} else {
// No version information for entries before year 2000
entry.setField(StandardField.URL, entryUrl);
}
}

/**
 * Extracts a version string for {@code identifier} from a piece of HTML.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so two signatures share one method
 * tail. The first signature with the two statements below it appears to be the pre-change variant, the
 * second signature with its two statements the replacement - confirm against the repository.
 *
 * Both variants build a start marker that ends in {@code identifier + "/"} and hand it, together with an
 * end marker, to {@code getRequiredValueBetween}; the result is returned unchanged. That helper is not
 * visible here - presumably it returns the text between the two markers and fails with a
 * {@code FetcherException} when they are missing.
 *
 * @param identifier the identifier embedded in the link that precedes the version
 * @return the value produced by {@code getRequiredValueBetween} for the markers described above
 * @throws FetcherException declared here; presumably raised by {@code getRequiredValueBetween}
 */
// Variant 1: the marker is the "Version:" label followed by a link to "/<identifier>/"; the value ends
// at the next double quote.
private String getVersion(String identifier, String descriptiveHtml) throws FetcherException {
String startOfVersionString = "<b>Version: </b><a href=\"/" + identifier + "/";
String version = getRequiredValueBetween(startOfVersionString, "\"", descriptiveHtml);
// Variant 2: the marker is a list item linking to "/archive/<identifier>/"; the value ends at the
// closing `">` of that link's href attribute.
private String getVersion(String identifier, String versionHtml) throws FetcherException {
String startOfVersionString = "<li><a href=\"/archive/" + identifier + "/";
String version = getRequiredValueBetween(startOfVersionString, "\">", versionHtml);
return version;
}

/**
 * Extracts the abstract text from the HTML of a description page.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers; {@code abstractText} is declared
 * twice because a superseded and a replacement extraction appear back to back. The first three lines of
 * the body appear to be the pre-change variant - confirm against the repository.
 *
 * @param descriptiveHtml HTML of the description page
 * @return the abstract as produced by {@code getRequiredValueBetween}
 * @throws FetcherException declared here; presumably raised by {@code getRequiredValueBetween} (helper
 *                          not visible in this listing)
 */
private String getAbstract(String descriptiveHtml) throws FetcherException {
// Variant 1: text between the bold "Abstract:" label and the next "<p />".
String abstractText = getRequiredValueBetween("<b>Abstract: </b>", "<p />", descriptiveHtml);
// for some reason, all spaces are doubled...
// Each pair of adjacent whitespace characters is replaced by the second character of the pair.
abstractText = abstractText.replaceAll("\\s(\\s)", "$1");
// Variant 2: text between the "Abstract" heading plus the opening pre-wrap paragraph tag and the next
// "</p>". The start marker includes the literal newline and spacing between the two tags, so it only
// matches when the page uses exactly that whitespace.
String startOfAbstractString = "<h5 class=\"mt-3\">Abstract</h5>\n <p style=\"white-space: pre-wrap;\">";
String abstractText = getRequiredValueBetween(startOfAbstractString, "</p>", descriptiveHtml);
return abstractText;
}

/**
 * Picks the most recent date out of the raw date text taken from a description page.
 *
 * NOTE(review): within this diff listing the method appears to be part of the removed code; it is kept
 * verbatim here.
 *
 * @param dateStringAsInHtml raw date text as cut out of the page
 * @return the latest successfully parsed date, formatted with {@code DATE_FORMAT_BIBTEX}
 * @throws FetcherException if the text contains "withdrawn", or if none of its parts could be parsed
 */
private String getLatestDate(String dateStringAsInHtml) throws FetcherException {
if (dateStringAsInHtml.contains("withdrawn")) {
throw new FetcherException(Localization.lang("This paper has been withdrawn."));
}
// Split wherever ", " is followed by a non-digit. The non-digit is part of the delimiter, so every
// piece after the first starts one character late.
String[] rawDates = dateStringAsInHtml.split(", \\D");
List<String> formattedDates = new ArrayList<>();
for (String rawDate : rawDates) {
TemporalAccessor date = parseSingleDateFromWebsite(rawDate);
// Pieces that could not be parsed come back as null and are skipped.
if (date != null) {
formattedDates.add(DATE_FORMAT_BIBTEX.format(date));
}
}

if (formattedDates.isEmpty()) {
throw new FetcherException(Localization.lang("Entry from %0 could not be parsed.", "IACR"));
}

// The dates are ISO_LOCAL_DATE strings (yyyy-MM-dd), so sorting the strings in reverse order puts the
// latest date first.
Collections.sort(formattedDates, Collections.reverseOrder());
return formattedDates.get(0);
}

/**
 * Tries to parse one date fragment from the website; returns {@code null} when no known format fits.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers. This method appears to be removed
 * code, and the {@code getDate} method that follows appears to be its replacement; the two share a single
 * closing brace in the listing, so the span is not compilable as shown.
 *
 * @param dateStringFromWebsite a single date fragment as found on the page
 * @return the parsed date, or {@code null} if neither pattern matched or parsing failed
 */
private TemporalAccessor parseSingleDateFromWebsite(String dateStringFromWebsite) {
TemporalAccessor date = null;
// Some entries contain double spaces in the date string (which would break our regexs below)
String dateStringWithoutDoubleSpaces = dateStringFromWebsite.replaceAll("\\s\\s+", " ");

// First attempt: the "d MMM yyyy" form captured by DATE_FROM_WEBSITE_AFTER_2000_PATTERN.
Matcher dateMatcherAfter2000 = DATE_FROM_WEBSITE_AFTER_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim());
if (dateMatcherAfter2000.find()) {
try {
date = DATE_FORMAT_WEBSITE_AFTER_2000.parse(dateMatcherAfter2000.group(1));
} catch (DateTimeParseException e) {
// A parse failure is only logged; date keeps its previous value.
LOGGER.warn("Date from IACR could not be parsed", e);
}
}

// Entries before year 2000 use a variety of date formats - fortunately, we can match them with only two different
// date formats (each of which differ from the unified format of post-2000 entries).
// This second attempt runs regardless of the outcome of the first one and overwrites date when it
// succeeds.
Matcher dateMatcherBefore2000 = DATE_FROM_WEBSITE_BEFORE_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim());
if (dateMatcherBefore2000.find()) {
// The formatters below expect no comma between day and year.
String dateWithoutComma = dateMatcherBefore2000.group(1).replace(",", "");
try {
date = DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS.parse(dateWithoutComma);
} catch (DateTimeParseException e) {
// Full month name did not parse - retry with the abbreviated-month formatter.
try {
date = DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS.parse(dateWithoutComma);
} catch (DateTimeException e1) {
// Both attempts failed: log the failure of the long-month attempt (e) and of the
// short-month attempt (e1).
LOGGER.warn("Date from IACR could not be parsed", e);
LOGGER.warn("Date from IACR could not be parsed", e1);
}
}
}

return date;
/**
 * Extracts the date text from the HTML of a description page.
 *
 * Passes a start marker (the "History" definition term followed by the opening {@code <dd>}, including
 * the literal newlines and spacing between them) and ":" as end marker to
 * {@code getRequiredValueBetween} and returns the result unchanged. No parsing or reformatting of the
 * extracted text happens in this method.
 *
 * @param descriptiveHtml HTML of the description page
 * @return the value produced by {@code getRequiredValueBetween} - presumably the text between the
 *         marker and the next ":" (helper not visible in this listing)
 * @throws FetcherException declared here; presumably raised by {@code getRequiredValueBetween}
 */
private String getDate(String descriptiveHtml) throws FetcherException {
String startOfHistoryString = "<dt>History</dt>\n \n \n <dd>";
String dateStringAsInHtml = getRequiredValueBetween(startOfHistoryString, ":", descriptiveHtml);
return dateStringAsInHtml;
}

private String getHtml(String url) throws FetcherException {
Expand Down
1 change: 0 additions & 1 deletion src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,6 @@ Removes\ all\ hyphenated\ line\ breaks\ in\ the\ field\ content.=Removes all hyp
Could\ not\ retrieve\ entry\ data\ from\ '%0'.=Could not retrieve entry data from '%0'.
Entry\ from\ %0\ could\ not\ be\ parsed.=Entry from %0 could not be parsed.
Invalid\ identifier\:\ '%0'.=Invalid identifier: '%0'.
This\ paper\ has\ been\ withdrawn.=This paper has been withdrawn.
empty\ citation\ key=empty citation key
Aux\ file=Aux file
Group\ containing\ entries\ cited\ in\ a\ given\ TeX\ file=Group containing entries cited in a given TeX file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,36 +44,37 @@ public void setUp() {
fetcher = new IacrEprintFetcher(mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS));

abram2017 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2017:1118")
.withCitationKey("cryptoeprint:2017/1118")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Ittai Abraham and Dahlia Malkhi and Kartik Nayak and Ling Ren and Alexander Spiegelman")
.withField(StandardField.DATE, "2017-11-18")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1118")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1118}")
.withField(StandardField.DATE, "2017-11-24")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1118")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1118}")
.withField(StandardField.TITLE, "Solida: A Blockchain Protocol Based on Reconfigurable Byzantine Consensus")
.withField(StandardField.URL, "https://eprint.iacr.org/2017/1118/20171124:064527")
.withField(StandardField.VERSION, "20171124:064527")
.withField(StandardField.YEAR, "2017");

beierle2016 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2016:119")
.withCitationKey("cryptoeprint:2016/119")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Christof Beierle and Thorsten Kranz and Gregor Leander")
.withField(StandardField.DATE, "2017-02-17")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2016/119")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2016/119}")
.withField(StandardField.DOI, "10.1007/978-3-662-53018-4_23")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2016/119")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2016/119}")
.withField(StandardField.TITLE, "Lightweight Multiplication in GF(2^n) with Applications to MDS Matrices")
.withField(StandardField.URL, "https://eprint.iacr.org/2016/119/20170217:150415")
.withField(StandardField.VERSION, "20170217:150415")
.withField(StandardField.YEAR, "2016");

delgado2017 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2017:1095")
.withCitationKey("cryptoeprint:2017/1095")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Sergi Delgado-Segura and Cristina Pérez-Solà and Guillermo Navarro-Arribas and Jordi Herrera-Joancomartí")
.withField(StandardField.DATE, "2018-01-19")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1095")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1095}")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1095")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1095}")
.withField(StandardField.TITLE, "Analysis of the Bitcoin UTXO set")
.withField(StandardField.URL, "https://eprint.iacr.org/2017/1095/20180119:113352")
.withField(StandardField.VERSION, "20180119:113352")
Expand Down

0 comments on commit 41edd28

Please sign in to comment.