Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework IACR fetcher #8904

Merged
merged 4 commits into from
Jun 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 18 additions & 86 deletions src/main/java/org/jabref/logic/importer/fetcher/IacrEprintFetcher.java
Original file line number Diff line number Diff line change
@@ -1,17 +1,8 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.time.DateTimeException;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.TemporalAccessor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.importer.FetcherException;
Expand All @@ -26,25 +17,16 @@
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IacrEprintFetcher implements IdBasedFetcher {

public static final String NAME = "IACR eprints";

private static final Logger LOGGER = LoggerFactory.getLogger(IacrEprintFetcher.class);
private static final Pattern DATE_FROM_WEBSITE_AFTER_2000_PATTERN = Pattern.compile("[a-z ]+(\\d{1,2} [A-Za-z][a-z]{2} \\d{4})");
private static final Pattern DATE_FROM_WEBSITE_BEFORE_2000_PATTERN = Pattern.compile("[A-Za-z ]+? ([A-Za-z][a-z]{2,10} \\d{1,2}(th|st|nd|rd)?, \\d{4})\\.?");
private static final Pattern WITHOUT_LETTERS_SPACE = Pattern.compile("[^0-9/]");

private static final DateTimeFormatter DATE_FORMAT_WEBSITE_AFTER_2000 = DateTimeFormatter.ofPattern("d MMM yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS = DateTimeFormatter.ofPattern("MMMM d['th']['st']['nd']['rd'] yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS = DateTimeFormatter.ofPattern("MMM d['th']['st']['nd']['rd'] yyyy", Locale.US);
private static final DateTimeFormatter DATE_FORMAT_BIBTEX = DateTimeFormatter.ISO_LOCAL_DATE;
private static final Predicate<String> IDENTIFIER_PREDICATE = Pattern.compile("\\d{4}/\\d{3,5}").asPredicate();
private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/eprint-bin/cite.pl?entry=";
private static final String CITATION_URL_PREFIX = "https://eprint.iacr.org/";
private static final String DESCRIPTION_URL_PREFIX = "https://eprint.iacr.org/";
private static final String VERSION_URL_PREFIX = "https://eprint.iacr.org/archive/versions/";

private final ImportFormatPreferences prefs;

Expand Down Expand Up @@ -74,7 +56,7 @@ private Optional<BibEntry> createEntryFromIacrCitation(String validIdentifier) t
if (bibtexCitationHtml.contains("No such report found")) {
throw new FetcherException(Localization.lang("No results found."));
}
String actualEntry = getRequiredValueBetween("<pre>", "</pre>", bibtexCitationHtml);
String actualEntry = getRequiredValueBetween("<pre id=\"bibtex\">", "</pre>", bibtexCitationHtml);

try {
return BibtexParser.singleFromString(actualEntry, prefs, new DummyFileUpdateMonitor());
Expand All @@ -86,86 +68,36 @@ private Optional<BibEntry> createEntryFromIacrCitation(String validIdentifier) t
private void setAdditionalFields(BibEntry entry, String identifier) throws FetcherException {
String entryUrl = DESCRIPTION_URL_PREFIX + identifier;
String descriptiveHtml = getHtml(entryUrl);

entry.setField(StandardField.ABSTRACT, getAbstract(descriptiveHtml));
String dateStringAsInHtml = getRequiredValueBetween("<b>Date: </b>", "<p />", descriptiveHtml);
entry.setField(StandardField.DATE, getLatestDate(dateStringAsInHtml));
entry.setField(StandardField.DATE, getDate(descriptiveHtml));

// Version information for entries after year 2000
if (isFromOrAfterYear2000(entry)) {
String version = getVersion(identifier, descriptiveHtml);
String entryVersion = VERSION_URL_PREFIX + identifier;
String versionHtml = getHtml(entryVersion);
String version = getVersion(identifier, versionHtml);
entry.setField(StandardField.VERSION, version);
entry.setField(StandardField.URL, entryUrl + "/" + version);
} else {
// No version information for entries before year 2000
entry.setField(StandardField.URL, entryUrl);
}
}

private String getVersion(String identifier, String descriptiveHtml) throws FetcherException {
String startOfVersionString = "<b>Version: </b><a href=\"/" + identifier + "/";
String version = getRequiredValueBetween(startOfVersionString, "\"", descriptiveHtml);
private String getVersion(String identifier, String versionHtml) throws FetcherException {
String startOfVersionString = "<li><a href=\"/archive/" + identifier + "/";
String version = getRequiredValueBetween(startOfVersionString, "\">", versionHtml);
return version;
}

private String getAbstract(String descriptiveHtml) throws FetcherException {
String abstractText = getRequiredValueBetween("<b>Abstract: </b>", "<p />", descriptiveHtml);
// for some reason, all spaces are doubled...
abstractText = abstractText.replaceAll("\\s(\\s)", "$1");
String startOfAbstractString = "<h5 class=\"mt-3\">Abstract</h5>\n <p style=\"white-space: pre-wrap;\">";
String abstractText = getRequiredValueBetween(startOfAbstractString, "</p>", descriptiveHtml);
return abstractText;
}

/**
 * Picks the most recent date out of the raw date text taken from an entry's web page.
 * <p>
 * The text is split on {@code ", \\D"}; every fragment is handed to
 * {@code parseSingleDateFromWebsite} and fragments that yield {@code null} are ignored.
 * The remaining dates are rendered with {@code DATE_FORMAT_BIBTEX}.
 *
 * @param dateStringAsInHtml the date text exactly as it was extracted from the HTML
 * @return the greatest formatted date string
 * @throws FetcherException if the text contains "withdrawn", or if no fragment could be parsed
 */
private String getLatestDate(String dateStringAsInHtml) throws FetcherException {
    if (dateStringAsInHtml.contains("withdrawn")) {
        throw new FetcherException(Localization.lang("This paper has been withdrawn."));
    }

    List<String> isoDates = new ArrayList<>();
    for (String fragment : dateStringAsInHtml.split(", \\D")) {
        TemporalAccessor parsed = parseSingleDateFromWebsite(fragment);
        if (parsed == null) {
            // Fragment did not match any known date layout; skip it.
            continue;
        }
        isoDates.add(DATE_FORMAT_BIBTEX.format(parsed));
    }

    if (isoDates.isEmpty()) {
        throw new FetcherException(Localization.lang("Entry from %0 could not be parsed.", "IACR"));
    }

    // The formatted strings are compared as plain text; the largest one is taken as the latest date.
    return Collections.max(isoDates);
}

private TemporalAccessor parseSingleDateFromWebsite(String dateStringFromWebsite) {
TemporalAccessor date = null;
// Some entries contain double spaces in the date string (which would break our regexs below)
String dateStringWithoutDoubleSpaces = dateStringFromWebsite.replaceAll("\\s\\s+", " ");

Matcher dateMatcherAfter2000 = DATE_FROM_WEBSITE_AFTER_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim());
if (dateMatcherAfter2000.find()) {
try {
date = DATE_FORMAT_WEBSITE_AFTER_2000.parse(dateMatcherAfter2000.group(1));
} catch (DateTimeParseException e) {
LOGGER.warn("Date from IACR could not be parsed", e);
}
}

// Entries before year 2000 use a variety of date formats - fortunately, we can match them with only two different
// date formats (each of which differ from the unified format of post-2000 entries).
Matcher dateMatcherBefore2000 = DATE_FROM_WEBSITE_BEFORE_2000_PATTERN.matcher(dateStringWithoutDoubleSpaces.trim());
if (dateMatcherBefore2000.find()) {
String dateWithoutComma = dateMatcherBefore2000.group(1).replace(",", "");
try {
date = DATE_FORMAT_WEBSITE_BEFORE_2000_LONG_MONTHS.parse(dateWithoutComma);
} catch (DateTimeParseException e) {
try {
date = DATE_FORMAT_WEBSITE_BEFORE_2000_SHORT_MONTHS.parse(dateWithoutComma);
} catch (DateTimeException e1) {
LOGGER.warn("Date from IACR could not be parsed", e);
LOGGER.warn("Date from IACR could not be parsed", e1);
}
}
}

return date;
private String getDate(String descriptiveHtml) throws FetcherException {
String startOfHistoryString = "<dt>History</dt>\n \n \n <dd>";
String dateStringAsInHtml = getRequiredValueBetween(startOfHistoryString, ":", descriptiveHtml);
return dateStringAsInHtml;
}

private String getHtml(String url) throws FetcherException {
Expand Down
1 change: 0 additions & 1 deletion src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,6 @@ Removes\ all\ hyphenated\ line\ breaks\ in\ the\ field\ content.=Removes all hyp
Could\ not\ retrieve\ entry\ data\ from\ '%0'.=Could not retrieve entry data from '%0'.
Entry\ from\ %0\ could\ not\ be\ parsed.=Entry from %0 could not be parsed.
Invalid\ identifier\:\ '%0'.=Invalid identifier: '%0'.
This\ paper\ has\ been\ withdrawn.=This paper has been withdrawn.
empty\ citation\ key=empty citation key
Aux\ file=Aux file
Group\ containing\ entries\ cited\ in\ a\ given\ TeX\ file=Group containing entries cited in a given TeX file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,36 +44,37 @@ public void setUp() {
fetcher = new IacrEprintFetcher(mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS));

abram2017 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2017:1118")
.withCitationKey("cryptoeprint:2017/1118")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Ittai Abraham and Dahlia Malkhi and Kartik Nayak and Ling Ren and Alexander Spiegelman")
.withField(StandardField.DATE, "2017-11-18")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1118")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1118}")
.withField(StandardField.DATE, "2017-11-24")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1118")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1118}")
.withField(StandardField.TITLE, "Solida: A Blockchain Protocol Based on Reconfigurable Byzantine Consensus")
.withField(StandardField.URL, "https://eprint.iacr.org/2017/1118/20171124:064527")
.withField(StandardField.VERSION, "20171124:064527")
.withField(StandardField.YEAR, "2017");

beierle2016 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2016:119")
.withCitationKey("cryptoeprint:2016/119")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Christof Beierle and Thorsten Kranz and Gregor Leander")
.withField(StandardField.DATE, "2017-02-17")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2016/119")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2016/119}")
.withField(StandardField.DOI, "10.1007/978-3-662-53018-4_23")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2016/119")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2016/119}")
.withField(StandardField.TITLE, "Lightweight Multiplication in GF(2^n) with Applications to MDS Matrices")
.withField(StandardField.URL, "https://eprint.iacr.org/2016/119/20170217:150415")
.withField(StandardField.VERSION, "20170217:150415")
.withField(StandardField.YEAR, "2016");

delgado2017 = new BibEntry(StandardEntryType.Misc)
.withCitationKey("cryptoeprint:2017:1095")
.withCitationKey("cryptoeprint:2017/1095")
.withField(StandardField.ABSTRACT, "dummy")
.withField(StandardField.AUTHOR, "Sergi Delgado-Segura and Cristina Pérez-Solà and Guillermo Navarro-Arribas and Jordi Herrera-Joancomartí")
.withField(StandardField.DATE, "2018-01-19")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Report 2017/1095")
.withField(StandardField.NOTE, "\\url{https://ia.cr/2017/1095}")
.withField(StandardField.HOWPUBLISHED, "Cryptology ePrint Archive, Paper 2017/1095")
.withField(StandardField.NOTE, "\\url{https://eprint.iacr.org/2017/1095}")
.withField(StandardField.TITLE, "Analysis of the Bitcoin UTXO set")
.withField(StandardField.URL, "https://eprint.iacr.org/2017/1095/20180119:113352")
.withField(StandardField.VERSION, "20180119:113352")
Expand Down