Skip to content

Commit

Permalink
Add ArXiv identifier batch lookup (#2710)
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiasdiez authored Apr 4, 2017
1 parent 8999411 commit b714206
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 58 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- The `Move linked files to default file directory`-Cleanup operation respects the `File directory pattern` setting
- We separated the `Move file` and `Rename Pdfs` logic and context menu entries in the `General`-Tab for the Field `file` to improve the semantics
- A scrollbar was added to the cleanup panel, as a result of issue [#2501](https://github.com/JabRef/jabref/issues/2501)
- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs and other identifiers for multiple entries.
- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs, ArXiv ids and other identifiers for multiple entries.
- F4 opens selected file in current JTable context not just from selected entry inside the main table [#2355](https://github.com/JabRef/jabref/issues/2355)
- We added an option to copy the title of BibTeX entries to the clipboard through `Edit -> Copy title` (implements [#210](https://github.com/koppor/jabref/issues/210))
- Several scrollbars were added to the preference dialog which show up when content is too large [#2559](https://github.com/JabRef/jabref/issues/2559)
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/gui/JabRefFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,7 @@ private void fillMenu() {
quality.add(findUnlinkedFiles);
quality.add(autoLinkFile);

for (IdFetcher fetcher : WebFetchers.getIdFetchers()) {
for (IdFetcher fetcher : WebFetchers.getIdFetchers(Globals.prefs.getImportFormatPreferences())) {
lookupIdentifiers.add(new LookupIdentifierAction(this, fetcher));
}
quality.add(lookupIdentifiers);
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ public static List<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferen
return list;
}

public static List<IdFetcher> getIdFetchers() {
public static List<IdFetcher> getIdFetchers(ImportFormatPreferences importFormatPreferences) {
ArrayList<IdFetcher> list = new ArrayList<>();
list.add(new CrossRef());
list.add(new ArXiv(importFormatPreferences));
list.sort(Comparator.comparing(WebFetcher::getName));
return list;
}
Expand Down
120 changes: 76 additions & 44 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.IdBasedFetcher;
import org.jabref.logic.importer.IdFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.util.OAI2Handler;
Expand All @@ -32,6 +33,7 @@
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.strings.StringUtil;
import org.jabref.model.util.OptionalUtil;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
Expand All @@ -51,7 +53,7 @@
* <a href="https://github.com/nathangrigg/arxiv2bib">arxiv2bib</a> which is <a href="https://arxiv2bibtex.org/">live</a>
* <a href="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
*/
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher {
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {
private static final Log LOGGER = LogFactory.getLog(ArXiv.class);

private static final String API_URL = "http://export.arxiv.org/api/query";
Expand All @@ -65,51 +67,23 @@ public ArXiv(ImportFormatPreferences importFormatPreferences) {
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException {
Objects.requireNonNull(entry);
Optional<URL> pdfUrl = Optional.empty();

// 1. Eprint
Optional<String> identifier = entry.getField(FieldName.EPRINT);
if (StringUtil.isNotBlank(identifier)) {
try {
// Get pdf of entry with the specified id
pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl);
if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
return pdfUrl;
}
} catch (FetcherException e) {
LOGGER.warn("arXiv eprint API request failed", e);
}
}

// 2. DOI
Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::build);
if (doi.isPresent()) {
String doiString = doi.get().getDOI();
// Search for an entry in the ArXiv which is linked to the doi
try {
Optional<ArXivEntry> arxivEntry = searchForEntry("doi:" + doiString);

if (arxivEntry.isPresent()) {
// Check if entry is a match
StringSimilarity match = new StringSimilarity();
String arxivTitle = arxivEntry.get().title.orElse("");
String entryTitle = entry.getField(FieldName.TITLE).orElse("");

if (match.isSimilar(arxivTitle, entryTitle)) {
pdfUrl = arxivEntry.get().getPdfUrl();
if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
return pdfUrl;
}
}
}
} catch (FetcherException e) {
LOGGER.warn("arXiv DOI API request failed", e);
try {
Optional<URL> pdfUrl = searchForEntries(entry).stream()
.map(ArXivEntry::getPdfUrl)
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();

if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
}
return pdfUrl;
} catch (FetcherException e) {
LOGGER.warn("arXiv API request failed", e);
}

return pdfUrl;
return Optional.empty();
}

private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherException {
Expand All @@ -135,6 +109,47 @@ private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherExcepti
}
}

/**
 * Looks up the arXiv record that corresponds to the given bibliographic entry.
 * <p>
 * The lookup proceeds in this order:
 * <ol>
 *   <li>If the entry has a non-blank {@code eprint} field, the record is fetched by that id and the
 *       result (possibly empty) is returned directly.</li>
 *   <li>Otherwise, or if the id request failed, a query is built from the DOI when one can be parsed,
 *       and from author and/or title when not. The first hit is accepted only when its title is
 *       similar to the entry's title.</li>
 * </ol>
 * If the entry provides none of these fields, no query is issued and an empty list is returned.
 *
 * @param entry the entry to find on the arXiv
 * @return a list holding the matching arXiv record, or an empty list if nothing suitable was found
 * @throws FetcherException if the DOI/author/title query fails
 */
private List<ArXivEntry> searchForEntries(BibEntry entry) throws FetcherException {
    // 1. Eprint
    Optional<String> identifier = entry.getField(FieldName.EPRINT);
    if (StringUtil.isNotBlank(identifier)) {
        try {
            // Get the entry with the specified id
            return OptionalUtil.toList(searchForEntryById(identifier.get()));
        } catch (FetcherException e) {
            // A failed id lookup is not fatal: fall through to the field-based search below
            LOGGER.warn("arXiv eprint API request failed", e);
        }
    }

    // 2. DOI and other fields
    String query;

    Optional<String> doi = entry.getField(FieldName.DOI).flatMap(DOI::build).map(DOI::getNormalized);
    if (doi.isPresent()) {
        // Search for an entry in the ArXiv which is linked to the doi
        query = "doi:" + doi.get();
    } else {
        Optional<String> authorQuery = entry.getField(FieldName.AUTHOR).map(author -> "au:" + author);
        Optional<String> titleQuery = entry.getField(FieldName.TITLE).map(title -> "ti:" + title);
        query = String.join("+AND+", OptionalUtil.toList(authorQuery, titleQuery));
    }

    if (query.isEmpty()) {
        // The entry offers nothing to search for; do not pass an empty query to the search
        return Collections.emptyList();
    }

    Optional<ArXivEntry> arxivEntry = searchForEntry(query);

    if (arxivEntry.isPresent()) {
        // Check if entry is a match: the first hit may be an unrelated paper
        StringSimilarity match = new StringSimilarity();
        String arxivTitle = arxivEntry.get().title.orElse("");
        String entryTitle = entry.getField(FieldName.TITLE).orElse("");

        if (match.isSimilar(arxivTitle, entryTitle)) {
            return OptionalUtil.toList(arxivEntry);
        }
    }

    return Collections.emptyList();
}

/**
 * Runs a free-text search against the arXiv API by delegating to {@code queryApi} with the given
 * query and an empty id list.
 *
 * @param searchQuery the query string handed to {@code queryApi}
 * @return the entries returned by {@code queryApi}
 * @throws FetcherException if the underlying API query fails
 */
private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherException {
// NOTE(review): 0 and 10 are presumably the start index and the maximum number of results - confirm against queryApi
return queryApi(searchQuery, Collections.emptyList(), 0, 10);
}
Expand Down Expand Up @@ -242,6 +257,19 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
(arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
}

/**
 * Determines the arXiv identifier of the given entry by searching the arXiv for it.
 *
 * @param entry the entry whose identifier should be looked up
 * @return the identifier of the first matching arXiv record that has a parseable id,
 *         or an empty Optional if there is none
 * @throws FetcherException if the arXiv search fails
 */
@Override
public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
    for (ArXivEntry candidate : searchForEntries(entry)) {
        Optional<ArXivIdentifier> candidateId = candidate.getId();
        if (candidateId.isPresent()) {
            return candidateId;
        }
    }
    return Optional.empty();
}

/**
 * Returns the fixed label {@code "ArXiv"} for the kind of identifier this fetcher looks up.
 */
@Override
public String getIdentifierName() {
return "ArXiv";
}

private static class ArXivEntry {

Expand Down Expand Up @@ -326,7 +354,7 @@ public Optional<URL> getPdfUrl() {
/**
* Returns the arXiv identifier
*/
public Optional<String> getId() {
public Optional<String> getIdString() {
// remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID
String prefix = "http://arxiv.org/abs/";
return urlAbstractPage.map(abstractUrl -> {
Expand All @@ -338,6 +366,10 @@ public Optional<String> getId() {
});
}

/**
 * Returns the arXiv identifier as a typed {@link ArXivIdentifier}.
 *
 * @return the parsed identifier, or an empty Optional if no id string is available
 *         or it cannot be parsed
 */
public Optional<ArXivIdentifier> getId() {
    Optional<String> rawId = getIdString();
    return rawId.flatMap(ArXivIdentifier::parse);
}

/**
* Returns the date when the first version was put on the arXiv
*/
Expand All @@ -358,7 +390,7 @@ public BibEntry toBibEntry(Character keywordDelimiter) {
bibEntry.setField(FieldName.EPRINTTYPE, "arXiv");
bibEntry.setField(FieldName.AUTHOR, String.join(" and ", authorNames));
bibEntry.addKeywords(categories, keywordDelimiter);
getId().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id));
getIdString().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id));
title.ifPresent(titleContent -> bibEntry.setField(FieldName.TITLE, titleContent));
doi.ifPresent(doiContent -> bibEntry.setField(FieldName.DOI, doiContent));
abstractText.ifPresent(abstractContent -> bibEntry.setField(FieldName.ABSTRACT, abstractContent));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import java.util.Objects;
import java.util.Optional;

public class ArXivIdentifier {
import org.jabref.model.entry.FieldName;

public class ArXivIdentifier implements Identifier {

private final String identifier;

Expand Down Expand Up @@ -31,6 +33,12 @@ public int hashCode() {
return identifier.hashCode();
}

/**
 * Returns {@link FieldName#EPRINT} as the default field for this identifier.
 */
@Override
public String getDefaultField() {
return FieldName.EPRINT;
}

/**
 * Returns the stored identifier string as is.
 */
@Override
public String getNormalized() {
return identifier;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public void getSearchBasedFetchersReturnsAllFetcherDerivingFromSearchBasedFetche

@Test
public void getIdFetchersReturnsAllFetcherDerivingFromIdFetcher() throws Exception {
List<IdFetcher> idFetchers = WebFetchers.getIdFetchers();
List<IdFetcher> idFetchers = WebFetchers.getIdFetchers(importFormatPreferences);

Set<Class<? extends IdFetcher>> expected = reflections.getSubTypesOf(IdFetcher.class);
expected.remove(IdParserFetcher.class);
Expand Down
42 changes: 33 additions & 9 deletions src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BiblatexEntryTypes;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.testutils.category.FetcherTests;

import org.junit.Assert;
Expand Down Expand Up @@ -52,61 +53,77 @@ public void setUp() {
}

@Test
public void noIdentifierPresent() throws IOException {
public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test(expected = NullPointerException.class)
public void rejectNullParameter() throws IOException {
public void findFullTextRejectsNullParameter() throws IOException {
finder.findFullText(null);
Assert.fail();
}

@Test
public void findByDOI() throws IOException {
public void findFullTextByDOI() throws IOException {
entry.setField(FieldName.DOI, "10.1529/biophysj.104.047340");
entry.setField(FieldName.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));

}

@Test
public void findByEprint() throws IOException {
public void findFullTextByEprint() throws IOException {
entry.setField("eprint", "1603.06570");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findByEprintWithPrefix() throws IOException {
public void findFullTextByEprintWithPrefix() throws IOException {
entry.setField("eprint", "arXiv:1603.06570");
assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void findByEprintWithUnknownDOI() throws IOException {
public void findFullTextByEprintWithUnknownDOI() throws IOException {
entry.setField("doi", "10.1529/unknown");
entry.setField("eprint", "1603.06570");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
}

@Test
public void notFoundByUnknownDOI() throws IOException {
public void findFullTextByTitle() throws IOException {
entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

/**
 * With no eprint or DOI set, the full text is expected to be found from the title combined with
 * the author field, here given as "Weeks and Lucks".
 */
@Test
public void findFullTextByTitleAndPartOfAuthor() throws IOException {
entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping");
// NOTE(review): presumably only a subset of the paper's authors, as the test name suggests - confirm
entry.setField("author", "Weeks and Lucks");

assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
}

@Test
public void notFindFullTextByUnknownDOI() throws IOException {
entry.setField("doi", "10.1529/unknown");

assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void notFoundByUnknownId() throws IOException {
public void notFindFullTextByUnknownId() throws IOException {
entry.setField("eprint", "1234.12345");

assertEquals(Optional.empty(), finder.findFullText(entry));
}

@Test
public void findByDOINotAvailableInCatalog() throws IOException {
public void findFullTextByDOINotAvailableInCatalog() throws IOException {
entry.setField(FieldName.DOI, "10.1016/0370-2693(77)90015-6");
entry.setField(FieldName.TITLE, "Superspace formulation of supergravity");

Expand Down Expand Up @@ -173,4 +190,11 @@ public void searchWithMalformedIdThrowsException() throws Exception {
expectedException.expectMessage("incorrect id format");
finder.performSearchById("123412345");
}

/**
 * The arXiv identifier of the slice theorem paper is expected to be found once its eprint field
 * has been cleared.
 */
@Test
public void searchIdentifierForSlicePaper() throws Exception {
// Remove the eprint field so the identifier has to come from the entry's remaining fields
sliceTheoremPaper.clearField(FieldName.EPRINT);

assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper));
}
}

0 comments on commit b714206

Please sign in to comment.