Add ArXiv identifier batch lookup (#2710)

JabRef · Apr 4, 2017 · b714206 · b714206
1 parent 8999411
commit b714206
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 58 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -56,7 +56,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
 - The `Move linked files to default file directory`-Cleanup operation respects the `File directory pattern` setting 
 - We separated the `Move file` and `Rename Pdfs` logic and context menu entries in the `General`-Tab for the Field `file` to improve the semantics
 - A scrollbar was added to the cleanup panel, as a result of issue [#2501](https://github.com/JabRef/jabref/issues/2501)
-- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs and other identifiers for multiple entries.
+- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs, ArXiv ids and other identifiers for multiple entries.
 - F4 opens selected file in current JTable context not just from selected entry inside the main table [#2355](https://github.com/JabRef/jabref/issues/2355)
 - We added an option to copy the title of BibTeX entries to the clipboard through `Edit -> Copy title` (implements [#210](https://github.com/koppor/jabref/issues/210))
 - Several scrollbars were added to the preference dialog which show up when content is too large [#2559](https://github.com/JabRef/jabref/issues/2559)

diff --git a/src/main/java/org/jabref/gui/JabRefFrame.java b/src/main/java/org/jabref/gui/JabRefFrame.java
@@ -1177,7 +1177,7 @@ private void fillMenu() {
         quality.add(findUnlinkedFiles);
         quality.add(autoLinkFile);
 
-        for (IdFetcher fetcher : WebFetchers.getIdFetchers()) {
+        for (IdFetcher fetcher : WebFetchers.getIdFetchers(Globals.prefs.getImportFormatPreferences())) {
             lookupIdentifiers.add(new LookupIdentifierAction(this, fetcher));
         }
         quality.add(lookupIdentifiers);

diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java
@@ -94,9 +94,10 @@ public static List<EntryBasedFetcher> getEntryBasedFetchers(ImportFormatPreferen
         return list;
     }
 
-    public static List<IdFetcher> getIdFetchers() {
+    public static List<IdFetcher> getIdFetchers(ImportFormatPreferences importFormatPreferences) {
         ArrayList<IdFetcher> list = new ArrayList<>();
         list.add(new CrossRef());
+        list.add(new ArXiv(importFormatPreferences));
         list.sort(Comparator.comparing(WebFetcher::getName));
         return list;
     }

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
@@ -20,6 +20,7 @@
 import org.jabref.logic.importer.FetcherException;
 import org.jabref.logic.importer.FulltextFetcher;
 import org.jabref.logic.importer.IdBasedFetcher;
+import org.jabref.logic.importer.IdFetcher;
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.SearchBasedFetcher;
 import org.jabref.logic.importer.util.OAI2Handler;
@@ -32,6 +33,7 @@
 import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.strings.StringUtil;
+import org.jabref.model.util.OptionalUtil;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -51,7 +53,7 @@
  * <a href="https://github.com/nathangrigg/arxiv2bib">arxiv2bib</a> which is <a href="https://arxiv2bibtex.org/">live</a>
  * <a herf="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
  */
-public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher {
+public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {
     private static final Log LOGGER = LogFactory.getLog(ArXiv.class);
 
     private static final String API_URL = "http://export.arxiv.org/api/query";
@@ -65,51 +67,23 @@ public ArXiv(ImportFormatPreferences importFormatPreferences) {
     @Override
     public Optional<URL> findFullText(BibEntry entry) throws IOException {
         Objects.requireNonNull(entry);
-        Optional<URL> pdfUrl = Optional.empty();
 
-        // 1. Eprint
-        Optional<String> identifier = entry.getField(FieldName.EPRINT);
-        if (StringUtil.isNotBlank(identifier)) {
-            try {
-                // Get pdf of entry with the specified id
-                pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl);
-                if (pdfUrl.isPresent()) {
-                    LOGGER.info("Fulltext PDF found @ arXiv.");
-                    return pdfUrl;
-                }
-            } catch (FetcherException e) {
-                LOGGER.warn("arXiv eprint API request failed", e);
-            }
-        }
-
-        // 2. DOI
-        Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::build);
-        if (doi.isPresent()) {
-            String doiString = doi.get().getDOI();
-            // Search for an entry in the ArXiv which is linked to the doi
-            try {
-                Optional<ArXivEntry> arxivEntry = searchForEntry("doi:" + doiString);
-
-                if (arxivEntry.isPresent()) {
-                    // Check if entry is a match
-                    StringSimilarity match = new StringSimilarity();
-                    String arxivTitle = arxivEntry.get().title.orElse("");
-                    String entryTitle = entry.getField(FieldName.TITLE).orElse("");
-
-                    if (match.isSimilar(arxivTitle, entryTitle)) {
-                        pdfUrl = arxivEntry.get().getPdfUrl();
-                        if (pdfUrl.isPresent()) {
-                            LOGGER.info("Fulltext PDF found @ arXiv.");
-                            return pdfUrl;
-                        }
-                    }
-                }
-            } catch (FetcherException e) {
-                LOGGER.warn("arXiv DOI API request failed", e);
+        try {
+            Optional<URL> pdfUrl = searchForEntries(entry).stream()
+                    .map(ArXivEntry::getPdfUrl)
+                    .filter(Optional::isPresent)
+                    .map(Optional::get)
+                    .findFirst();
+
+            if (pdfUrl.isPresent()) {
+                LOGGER.info("Fulltext PDF found @ arXiv.");
             }
+            return pdfUrl;
+        } catch (FetcherException e) {
+            LOGGER.warn("arXiv API request failed", e);
         }
 
-        return pdfUrl;
+        return Optional.empty();
     }
 
     private Optional<ArXivEntry> searchForEntry(String searchQuery) throws FetcherException {
@@ -135,6 +109,47 @@ private Optional<ArXivEntry> searchForEntryById(String id) throws FetcherExcepti
         }
     }
 
+    private List<ArXivEntry> searchForEntries(BibEntry entry) throws FetcherException {
+        // 1. Eprint: look the entry up directly by its arXiv identifier
+        Optional<String> identifier = entry.getField(FieldName.EPRINT);
+        if (StringUtil.isNotBlank(identifier)) {
+            try {
+                // Get the arXiv entry with the specified id; a failed request is only logged and execution continues with step 2
+                return OptionalUtil.toList(searchForEntryById(identifier.get()));
+            } catch (FetcherException e) {
+                LOGGER.warn("arXiv eprint API request failed", e);
+            }
+        }
+
+        // 2. DOI, or author and title when no DOI can be built from the entry
+        String query;
+
+        Optional<String> doi = entry.getField(FieldName.DOI).flatMap(DOI::build).map(DOI::getNormalized);
+        if (doi.isPresent()) {
+            // Search for an entry in the arXiv which is linked to the DOI
+            query = "doi:" + doi.get();
+        } else {
+            Optional<String> authorQuery = entry.getField(FieldName.AUTHOR).map(author -> "au:" + author);
+            Optional<String> titleQuery = entry.getField(FieldName.TITLE).map(title -> "ti:" + title);
+            query = OptionalUtil.toList(authorQuery, titleQuery).stream().collect(Collectors.joining("+AND+"));
+        }
+
+        Optional<ArXivEntry> arxivEntry = searchForEntry(query);
+
+        if (arxivEntry.isPresent()) {
+            // Accept the hit only if its title is similar to the title of the given entry
+            StringSimilarity match = new StringSimilarity();
+            String arxivTitle = arxivEntry.get().title.orElse("");
+            String entryTitle = entry.getField(FieldName.TITLE).orElse("");
+
+            if (match.isSimilar(arxivTitle, entryTitle)) {
+                return OptionalUtil.toList(arxivEntry);
+            }
+        }
+
+        return Collections.emptyList();
+    }
+
     private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherException {
         return queryApi(searchQuery, Collections.emptyList(), 0, 10);
     }
@@ -242,6 +257,19 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
                 (arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()));
     }
 
+    @Override
+    public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
+        return searchForEntries(entry).stream()
+                .map(ArXivEntry::getId)
+                .filter(Optional::isPresent)
+                .map(Optional::get)
+                .findFirst();
+    }
+
+    @Override
+    public String getIdentifierName() {
+        return "ArXiv";
+    }
 
     private static class ArXivEntry {
 
@@ -326,7 +354,7 @@ public Optional<URL> getPdfUrl() {
         /**
          * Returns the arXiv identifier
          */
-        public Optional<String> getId() {
+        public Optional<String> getIdString() {
             // remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID
             String prefix = "http://arxiv.org/abs/";
             return urlAbstractPage.map(abstractUrl -> {
@@ -338,6 +366,10 @@ public Optional<String> getId() {
             });
         }
 
+        public Optional<ArXivIdentifier> getId() {
+            return getIdString().flatMap(ArXivIdentifier::parse);
+        }
+
         /**
          * Returns the date when the first version was put on the arXiv
          */
@@ -358,7 +390,7 @@ public BibEntry toBibEntry(Character keywordDelimiter) {
             bibEntry.setField(FieldName.EPRINTTYPE, "arXiv");
             bibEntry.setField(FieldName.AUTHOR, String.join(" and ", authorNames));
             bibEntry.addKeywords(categories, keywordDelimiter);
-            getId().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id));
+            getIdString().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id));
             title.ifPresent(titleContent -> bibEntry.setField(FieldName.TITLE, titleContent));
             doi.ifPresent(doiContent -> bibEntry.setField(FieldName.DOI, doiContent));
             abstractText.ifPresent(abstractContent -> bibEntry.setField(FieldName.ABSTRACT, abstractContent));

diff --git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java
@@ -3,7 +3,9 @@
 import java.util.Objects;
 import java.util.Optional;
 
-public class ArXivIdentifier {
+import org.jabref.model.entry.FieldName;
+
+public class ArXivIdentifier implements Identifier {
 
     private final String identifier;
 
@@ -31,6 +33,12 @@ public int hashCode() {
         return identifier.hashCode();
     }
 
+    @Override
+    public String getDefaultField() {
+        return FieldName.EPRINT;
+    }
+
+    @Override
     public String getNormalized() {
         return identifier;
     }

diff --git a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java
@@ -60,7 +60,7 @@ public void getSearchBasedFetchersReturnsAllFetcherDerivingFromSearchBasedFetche
 
     @Test
     public void getIdFetchersReturnsAllFetcherDerivingFromIdFetcher() throws Exception {
-        List<IdFetcher> idFetchers = WebFetchers.getIdFetchers();
+        List<IdFetcher> idFetchers = WebFetchers.getIdFetchers(importFormatPreferences);
 
         Set<Class<? extends IdFetcher>> expected = reflections.getSubTypesOf(IdFetcher.class);
         expected.remove(IdParserFetcher.class);

diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java
@@ -10,6 +10,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.BiblatexEntryTypes;
 import org.jabref.model.entry.FieldName;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.testutils.category.FetcherTests;
 
 import org.junit.Assert;
@@ -52,61 +53,77 @@ public void setUp() {
     }
 
     @Test
-    public void noIdentifierPresent() throws IOException {
+    public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException {
         assertEquals(Optional.empty(), finder.findFullText(entry));
     }
 
     @Test(expected = NullPointerException.class)
-    public void rejectNullParameter() throws IOException {
+    public void findFullTextRejectsNullParameter() throws IOException {
         finder.findFullText(null);
         Assert.fail();
     }
 
     @Test
-    public void findByDOI() throws IOException {
+    public void findFullTextByDOI() throws IOException {
         entry.setField(FieldName.DOI, "10.1529/biophysj.104.047340");
         entry.setField(FieldName.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping");
 
         assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
+
     }
 
     @Test
-    public void findByEprint() throws IOException {
+    public void findFullTextByEprint() throws IOException {
         entry.setField("eprint", "1603.06570");
 
         assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
     }
 
     @Test
-    public void findByEprintWithPrefix() throws IOException {
+    public void findFullTextByEprintWithPrefix() throws IOException {
         entry.setField("eprint", "arXiv:1603.06570");
         assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
     }
 
     @Test
-    public void findByEprintWithUnknownDOI() throws IOException {
+    public void findFullTextByEprintWithUnknownDOI() throws IOException {
         entry.setField("doi", "10.1529/unknown");
         entry.setField("eprint", "1603.06570");
 
         assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry));
     }
 
     @Test
-    public void notFoundByUnknownDOI() throws IOException {
+    public void findFullTextByTitle() throws IOException {
+        entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping");
+
+        assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
+    }
+
+    @Test
+    public void findFullTextByTitleAndPartOfAuthor() throws IOException {
+        entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping");
+        entry.setField("author", "Weeks and Lucks");
+
+        assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry));
+    }
+
+    @Test
+    public void notFindFullTextByUnknownDOI() throws IOException {
         entry.setField("doi", "10.1529/unknown");
 
         assertEquals(Optional.empty(), finder.findFullText(entry));
     }
 
     @Test
-    public void notFoundByUnknownId() throws IOException {
+    public void notFindFullTextByUnknownId() throws IOException {
         entry.setField("eprint", "1234.12345");
 
         assertEquals(Optional.empty(), finder.findFullText(entry));
     }
 
     @Test
-    public void findByDOINotAvailableInCatalog() throws IOException {
+    public void findFullTextByDOINotAvailableInCatalog() throws IOException {
         entry.setField(FieldName.DOI, "10.1016/0370-2693(77)90015-6");
         entry.setField(FieldName.TITLE, "Superspace formulation of supergravity");
 
@@ -173,4 +190,11 @@ public void searchWithMalformedIdThrowsException() throws Exception {
         expectedException.expectMessage("incorrect id format");
         finder.performSearchById("123412345");
     }
+
+    @Test
+    public void searchIdentifierForSlicePaper() throws Exception {
+        sliceTheoremPaper.clearField(FieldName.EPRINT);
+
+        assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper));
+    }
 }