match both https and http prefixes in arxiv

JabRef · Jul 18, 2018 · 39e8b33 · 39e8b33
1 parent 6290a41
commit 39e8b33
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 24 deletions.
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
@@ -10,6 +10,8 @@
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import javax.xml.parsers.DocumentBuilder;
@@ -53,9 +55,13 @@
  * <a herf="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
  */
 public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {
+
     private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class);
 
     private static final String API_URL = "https://export.arxiv.org/api/query";
+    // Matches a leading http(s)://arxiv.org/abs/ prefix (anchored, literal dot); group 1 is the prefix to strip to get the arXiv ID
+    private static final String ARXIV_URL_PREFIX_FOR_ID = "^(https?://arxiv\\.org/abs/)";
+    private static final Pattern URL_PATTERN = Pattern.compile(ARXIV_URL_PREFIX_FOR_ID);
 
     private final ImportFormatPreferences importFormatPreferences;
 
@@ -69,10 +75,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
 
         try {
             Optional<URL> pdfUrl = searchForEntries(entry).stream()
-                    .map(ArXivEntry::getPdfUrl)
-                    .filter(Optional::isPresent)
-                    .map(Optional::get)
-                    .findFirst();
+                                                          .map(ArXivEntry::getPdfUrl)
+                                                          .filter(Optional::isPresent)
+                                                          .map(Optional::get)
+                                                          .findFirst();
 
             if (pdfUrl.isPresent()) {
                 LOGGER.info("Fulltext PDF found @ arXiv.");
@@ -159,7 +165,7 @@ private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherExce
     }
 
     private List<ArXivEntry> queryApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
-            throws FetcherException {
+        throws FetcherException {
         Document result = callApi(searchQuery, ids, start, maxResults);
         List<Node> entries = XMLUtil.asList(result.getElementsByTagName("entry"));
 
@@ -195,7 +201,7 @@ private Document callApi(String searchQuery, List<ArXivIdentifier> ids, int star
             }
             if (!ids.isEmpty()) {
                 uriBuilder.addParameter("id_list",
-                        ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
+                                        ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
             }
             uriBuilder.addParameter("start", String.valueOf(start));
             uriBuilder.addParameter("max_results", String.valueOf(maxResults));
@@ -252,7 +258,8 @@ public HelpFile getHelpPage() {
     @Override
     public List<BibEntry> performSearch(String query) throws FetcherException {
         return searchForEntries(query).stream().map(
-                (arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())).collect(Collectors.toList());
+                                                    (arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()))
+                                      .collect(Collectors.toList());
     }
 
     @Override
@@ -266,10 +273,10 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
     @Override
     public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
         return searchForEntries(entry).stream()
-                .map(ArXivEntry::getId)
-                .filter(Optional::isPresent)
-                .map(Optional::get)
-                .findFirst();
+                                      .map(ArXivEntry::getId)
+                                      .filter(Optional::isPresent)
+                                      .map(Optional::get)
+                                      .findFirst();
     }
 
     @Override
@@ -290,7 +297,6 @@ private static class ArXivEntry {
         private final Optional<String> journalReferenceText;
         private final Optional<String> primaryCategory;
 
-
         public ArXivEntry(Node item) {
             // see https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned
 
@@ -347,7 +353,7 @@ public ArXivEntry(Node item) {
             // Primary category
             // Ex: <arxiv:primary_category xmlns:arxiv="https://arxiv.org/schemas/atom" term="math-ph" scheme="http://arxiv.org/schemas/atom"/>
             primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category")
-                    .flatMap(node -> XMLUtil.getAttributeContent(node, "term"));
+                                     .flatMap(node -> XMLUtil.getAttributeContent(node, "term"));
         }
 
         public static String correctLineBreaks(String s) {
@@ -367,14 +373,15 @@ public Optional<URL> getPdfUrl() {
          * Returns the arXiv identifier
          */
         public Optional<String> getIdString() {
-            // remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID
-            String prefix = "http://arxiv.org/abs/";
+            // Strip the http(s)://arxiv.org/abs/ prefix only when it starts the URL (lookingAt anchors at index 0, like startsWith did)
             return urlAbstractPage.map(abstractUrl -> {
-                if (abstractUrl.startsWith(prefix)) {
-                    return abstractUrl.substring(prefix.length());
+                Matcher matcher = URL_PATTERN.matcher(abstractUrl);
+                if (matcher.lookingAt()) {
+                    return abstractUrl.substring(matcher.end());
                 } else {
                     return abstractUrl;
                 }
+
             });
         }
 
@@ -409,8 +416,7 @@ public BibEntry toBibEntry(Character keywordDelimiter) {
             getDate().ifPresent(date -> bibEntry.setField(FieldName.DATE, date));
             primaryCategory.ifPresent(category -> bibEntry.setField(FieldName.EPRINTCLASS, category));
             journalReferenceText.ifPresent(journal -> bibEntry.setField(FieldName.JOURNALTITLE, journal));
-            getPdfUrl().ifPresent(url -> bibEntry
-                    .setFiles(Collections.singletonList(new LinkedFile(url, "PDF"))));
+            getPdfUrl().ifPresent(url -> bibEntry.setFiles(Collections.singletonList(new LinkedFile(url, "PDF"))));
             return bibEntry;
         }
     }

diff --git a/src/test/java/org/jabref/logic/importer/fetcher/DBLPFetcherTest.java b/src/test/java/org/jabref/logic/importer/fetcher/DBLPFetcherTest.java
@@ -28,21 +28,21 @@ public class DBLPFetcherTest {
     public void setUp() {
         ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
         when(importFormatPreferences.getFieldContentParserPreferences())
-                .thenReturn(mock(FieldContentParserPreferences.class));
+                                                                        .thenReturn(mock(FieldContentParserPreferences.class));
         dblpFetcher = new DBLPFetcher(importFormatPreferences);
         entry = new BibEntry();
 
         entry.setType(BibtexEntryTypes.ARTICLE.getName());
         entry.setCiteKey("DBLP:journals/stt/GeigerHL16");
         entry.setField(FieldName.TITLE,
-                "Process Engine Benchmarking with Betsy in the Context of {ISO/IEC} Quality Standards");
+                       "Process Engine Benchmarking with Betsy in the Context of {ISO/IEC} Quality Standards");
         entry.setField(FieldName.AUTHOR, "Matthias Geiger and Simon Harrer and J{\\\"{o}}rg Lenhard");
         entry.setField(FieldName.JOURNAL, "Softwaretechnik-Trends");
         entry.setField(FieldName.VOLUME, "36");
         entry.setField(FieldName.NUMBER, "2");
         entry.setField(FieldName.YEAR, "2016");
         entry.setField(FieldName.URL,
-                "http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf");
+                       "http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf");
         entry.setField("biburl", "https://dblp.org/rec/bib/journals/stt/GeigerHL16");
         entry.setField("bibsource", "dblp computer science bibliography, https://dblp.org");
 
@@ -65,8 +65,7 @@ public void findSingleEntryUsingComplexOperators() throws FetcherException {
     }
 
     @Test
-    public void findNothing() throws Exception
-    {
+    public void findNothing() throws Exception {
         assertEquals(Collections.emptyList(), dblpFetcher.performSearch(""));
     }