From b714206282e064ef4e839b35a8063c165aff9570 Mon Sep 17 00:00:00 2001 From: Tobias Diez Date: Tue, 4 Apr 2017 22:21:53 +0200 Subject: [PATCH] Add ArXiv identifier batch lookup (#2710) --- CHANGELOG.md | 2 +- src/main/java/org/jabref/gui/JabRefFrame.java | 2 +- .../jabref/logic/importer/WebFetchers.java | 3 +- .../jabref/logic/importer/fetcher/ArXiv.java | 120 +++++++++++------- .../entry/identifier/ArXivIdentifier.java | 10 +- .../logic/importer/WebFetchersTest.java | 2 +- .../logic/importer/fetcher/ArXivTest.java | 42 ++++-- 7 files changed, 123 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 939201c7de7..72c639ea0e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,7 +56,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `# - The `Move linked files to default file directory`-Cleanup operation respects the `File directory pattern` setting - We separated the `Move file` and `Rename Pdfs` logic and context menu entries in the `General`-Tab for the Field `file` to improve the semantics - A scrollbar was added to the cleanup panel, as a result of issue [#2501](https://github.com/JabRef/jabref/issues/2501) -- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs and other identifiers for multiple entries. +- Using "Look up document identifier" in the quality menu, it is possible to look up DOIs, ArXiv ids and other identifiers for multiple entries. - F4 opens selected file in current JTable context not just from selected entry inside the main table [#2355](https://github.com/JabRef/jabref/issues/2355) - We added an option to copy the title of BibTeX entries to the clipboard through `Edit -> Copy title` (implements [#210](https://github.com/koppor/jabref/issues/210)) - Several scrollbars were added to the preference dialog which show up when content is too large [#2559](https://github.com/JabRef/jabref/issues/2559) diff --git a/src/main/java/org/jabref/gui/JabRefFrame.java b/src/main/java/org/jabref/gui/JabRefFrame.java index 998c692360a..95494891112 100644 --- a/src/main/java/org/jabref/gui/JabRefFrame.java +++ b/src/main/java/org/jabref/gui/JabRefFrame.java @@ -1177,7 +1177,7 @@ private void fillMenu() { quality.add(findUnlinkedFiles); quality.add(autoLinkFile); - for (IdFetcher fetcher : WebFetchers.getIdFetchers()) { + for (IdFetcher fetcher : WebFetchers.getIdFetchers(Globals.prefs.getImportFormatPreferences())) { lookupIdentifiers.add(new LookupIdentifierAction(this, fetcher)); } quality.add(lookupIdentifiers); diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java index 5e4c3903531..3f17abbaee4 100644 --- a/src/main/java/org/jabref/logic/importer/WebFetchers.java +++ b/src/main/java/org/jabref/logic/importer/WebFetchers.java @@ -94,9 +94,10 @@ public static List getEntryBasedFetchers(ImportFormatPreferen return list; } - public static List getIdFetchers() { + public static List getIdFetchers(ImportFormatPreferences importFormatPreferences) { ArrayList list = new ArrayList<>(); list.add(new CrossRef()); + list.add(new ArXiv(importFormatPreferences)); list.sort(Comparator.comparing(WebFetcher::getName)); return list; } diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java index 5575d0f1602..af617d5c289 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java @@ -20,6 +20,7 @@ import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.FulltextFetcher; import org.jabref.logic.importer.IdBasedFetcher; +import org.jabref.logic.importer.IdFetcher; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.SearchBasedFetcher; import org.jabref.logic.importer.util.OAI2Handler; @@ -32,6 +33,7 @@ import org.jabref.model.entry.identifier.ArXivIdentifier; import org.jabref.model.entry.identifier.DOI; import org.jabref.model.strings.StringUtil; +import org.jabref.model.util.OptionalUtil; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -51,7 +53,7 @@ * arxiv2bib which is live * dspace-portalmec */ -public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher { +public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher { private static final Log LOGGER = LogFactory.getLog(ArXiv.class); private static final String API_URL = "http://export.arxiv.org/api/query"; @@ -65,51 +67,23 @@ public ArXiv(ImportFormatPreferences importFormatPreferences) { @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - Optional pdfUrl = Optional.empty(); - // 1. Eprint - Optional identifier = entry.getField(FieldName.EPRINT); - if (StringUtil.isNotBlank(identifier)) { - try { - // Get pdf of entry with the specified id - pdfUrl = searchForEntryById(identifier.get()).flatMap(ArXivEntry::getPdfUrl); - if (pdfUrl.isPresent()) { - LOGGER.info("Fulltext PDF found @ arXiv."); - return pdfUrl; - } - } catch (FetcherException e) { - LOGGER.warn("arXiv eprint API request failed", e); - } - } - - // 2. DOI - Optional doi = entry.getField(FieldName.DOI).flatMap(DOI::build); - if (doi.isPresent()) { - String doiString = doi.get().getDOI(); - // Search for an entry in the ArXiv which is linked to the doi - try { - Optional arxivEntry = searchForEntry("doi:" + doiString); - - if (arxivEntry.isPresent()) { - // Check if entry is a match - StringSimilarity match = new StringSimilarity(); - String arxivTitle = arxivEntry.get().title.orElse(""); - String entryTitle = entry.getField(FieldName.TITLE).orElse(""); - - if (match.isSimilar(arxivTitle, entryTitle)) { - pdfUrl = arxivEntry.get().getPdfUrl(); - if (pdfUrl.isPresent()) { - LOGGER.info("Fulltext PDF found @ arXiv."); - return pdfUrl; - } - } - } - } catch (FetcherException e) { - LOGGER.warn("arXiv DOI API request failed", e); + try { + Optional pdfUrl = searchForEntries(entry).stream() + .map(ArXivEntry::getPdfUrl) + .filter(Optional::isPresent) + .map(Optional::get) + .findFirst(); + + if (pdfUrl.isPresent()) { + LOGGER.info("Fulltext PDF found @ arXiv."); } + return pdfUrl; + } catch (FetcherException e) { + LOGGER.warn("arXiv API request failed", e); } - return pdfUrl; + return Optional.empty(); } private Optional searchForEntry(String searchQuery) throws FetcherException { @@ -135,6 +109,47 @@ private Optional searchForEntryById(String id) throws FetcherExcepti } } + private List searchForEntries(BibEntry entry) throws FetcherException { + // 1. Eprint + Optional identifier = entry.getField(FieldName.EPRINT); + if (StringUtil.isNotBlank(identifier)) { + try { + // Get pdf of entry with the specified id + return OptionalUtil.toList(searchForEntryById(identifier.get())); + } catch (FetcherException e) { + LOGGER.warn("arXiv eprint API request failed", e); + } + } + + // 2. DOI and other fields + String query; + + Optional doi = entry.getField(FieldName.DOI).flatMap(DOI::build).map(DOI::getNormalized); + if (doi.isPresent()) { + // Search for an entry in the ArXiv which is linked to the doi + query = "doi:" + doi.get(); + } else { + Optional authorQuery = entry.getField(FieldName.AUTHOR).map(author -> "au:" + author); + Optional titleQuery = entry.getField(FieldName.TITLE).map(title -> "ti:" + title); + query = OptionalUtil.toList(authorQuery, titleQuery).stream().collect(Collectors.joining("+AND+")); + } + + Optional arxivEntry = searchForEntry(query); + + if (arxivEntry.isPresent()) { + // Check if entry is a match + StringSimilarity match = new StringSimilarity(); + String arxivTitle = arxivEntry.get().title.orElse(""); + String entryTitle = entry.getField(FieldName.TITLE).orElse(""); + + if (match.isSimilar(arxivTitle, entryTitle)) { + return OptionalUtil.toList(arxivEntry); + } + } + + return Collections.emptyList(); + } + private List searchForEntries(String searchQuery) throws FetcherException { return queryApi(searchQuery, Collections.emptyList(), 0, 10); } @@ -242,6 +257,19 @@ public Optional performSearchById(String identifier) throws FetcherExc (arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())); } + @Override + public Optional findIdentifier(BibEntry entry) throws FetcherException { + return searchForEntries(entry).stream() + .map(ArXivEntry::getId) + .filter(Optional::isPresent) + .map(Optional::get) + .findFirst(); + } + + @Override + public String getIdentifierName() { + return "ArXiv"; + } private static class ArXivEntry { @@ -326,7 +354,7 @@ public Optional getPdfUrl() { /** * Returns the arXiv identifier */ - public Optional getId() { + public Optional getIdString() { // remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID String prefix = "http://arxiv.org/abs/"; return urlAbstractPage.map(abstractUrl -> { @@ -338,6 +366,10 @@ public Optional getId() { }); } + public Optional getId() { + return getIdString().flatMap(ArXivIdentifier::parse); + } + /** * Returns the date when the first version was put on the arXiv */ @@ -358,7 +390,7 @@ public BibEntry toBibEntry(Character keywordDelimiter) { bibEntry.setField(FieldName.EPRINTTYPE, "arXiv"); bibEntry.setField(FieldName.AUTHOR, String.join(" and ", authorNames)); bibEntry.addKeywords(categories, keywordDelimiter); - getId().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id)); + getIdString().ifPresent(id -> bibEntry.setField(FieldName.EPRINT, id)); title.ifPresent(titleContent -> bibEntry.setField(FieldName.TITLE, titleContent)); doi.ifPresent(doiContent -> bibEntry.setField(FieldName.DOI, doiContent)); abstractText.ifPresent(abstractContent -> bibEntry.setField(FieldName.ABSTRACT, abstractContent)); diff --git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java index 19c49bfb979..3c73f7064e5 100644 --- a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java +++ b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java @@ -3,7 +3,9 @@ import java.util.Objects; import java.util.Optional; -public class ArXivIdentifier { +import org.jabref.model.entry.FieldName; + +public class ArXivIdentifier implements Identifier { private final String identifier; @@ -31,6 +33,12 @@ public int hashCode() { return identifier.hashCode(); } + @Override + public String getDefaultField() { + return FieldName.EPRINT; + } + + @Override public String getNormalized() { return identifier; } diff --git a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java index 7f179ef5fc5..0d09defd4a1 100644 --- a/src/test/java/org/jabref/logic/importer/WebFetchersTest.java +++ b/src/test/java/org/jabref/logic/importer/WebFetchersTest.java @@ -60,7 +60,7 @@ public void getSearchBasedFetchersReturnsAllFetcherDerivingFromSearchBasedFetche @Test public void getIdFetchersReturnsAllFetcherDerivingFromIdFetcher() throws Exception { - List idFetchers = WebFetchers.getIdFetchers(); + List idFetchers = WebFetchers.getIdFetchers(importFormatPreferences); Set> expected = reflections.getSubTypesOf(IdFetcher.class); expected.remove(IdParserFetcher.class); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java index 47af5898eea..1e6a4545f6e 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java @@ -10,6 +10,7 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BiblatexEntryTypes; import org.jabref.model.entry.FieldName; +import org.jabref.model.entry.identifier.ArXivIdentifier; import org.jabref.testutils.category.FetcherTests; import org.junit.Assert; @@ -52,39 +53,40 @@ public void setUp() { } @Test - public void noIdentifierPresent() throws IOException { + public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test(expected = NullPointerException.class) - public void rejectNullParameter() throws IOException { + public void findFullTextRejectsNullParameter() throws IOException { finder.findFullText(null); Assert.fail(); } @Test - public void findByDOI() throws IOException { + public void findFullTextByDOI() throws IOException { entry.setField(FieldName.DOI, "10.1529/biophysj.104.047340"); entry.setField(FieldName.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry)); + } @Test - public void findByEprint() throws IOException { + public void findFullTextByEprint() throws IOException { entry.setField("eprint", "1603.06570"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry)); } @Test - public void findByEprintWithPrefix() throws IOException { + public void findFullTextByEprintWithPrefix() throws IOException { entry.setField("eprint", "arXiv:1603.06570"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry)); } @Test - public void findByEprintWithUnknownDOI() throws IOException { + public void findFullTextByEprintWithUnknownDOI() throws IOException { entry.setField("doi", "10.1529/unknown"); entry.setField("eprint", "1603.06570"); @@ -92,21 +94,36 @@ public void findByEprintWithUnknownDOI() throws IOException { } @Test - public void notFoundByUnknownDOI() throws IOException { + public void findFullTextByTitle() throws IOException { + entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry)); + } + + @Test + public void findFullTextByTitleAndPartOfAuthor() throws IOException { + entry.setField("title", "Pause Point Spectra in DNA Constant-Force Unzipping"); + entry.setField("author", "Weeks and Lucks"); + + assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry)); + } + + @Test + public void notFindFullTextByUnknownDOI() throws IOException { entry.setField("doi", "10.1529/unknown"); assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test - public void notFoundByUnknownId() throws IOException { + public void notFindFullTextByUnknownId() throws IOException { entry.setField("eprint", "1234.12345"); assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test - public void findByDOINotAvailableInCatalog() throws IOException { + public void findFullTextByDOINotAvailableInCatalog() throws IOException { entry.setField(FieldName.DOI, "10.1016/0370-2693(77)90015-6"); entry.setField(FieldName.TITLE, "Superspace formulation of supergravity"); @@ -173,4 +190,11 @@ public void searchWithMalformedIdThrowsException() throws Exception { expectedException.expectMessage("incorrect id format"); finder.performSearchById("123412345"); } + + @Test + public void searchIdentifierForSlicePaper() throws Exception { + sliceTheoremPaper.clearField(FieldName.EPRINT); + + assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper)); + } }