Skip to content

Commit

Permalink
match both https and http prefixes in arxiv
Browse files Browse the repository at this point in the history
  • Loading branch information
Siedlerchr committed Jul 18, 2018
1 parent 6290a41 commit 39e8b33
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 24 deletions.
44 changes: 25 additions & 19 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilder;
Expand Down Expand Up @@ -53,9 +55,13 @@
* <a href="https://gitlab.c3sl.ufpr.br/portalmec/dspace-portalmec/blob/aa209d15082a9870f9daac42c78a35490ce77b52/dspace-api/src/main/java/org/dspace/submit/lookup/ArXivService.java">dspace-portalmec</a>
*/
public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetcher, IdFetcher<ArXivIdentifier> {

private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class);

private static final String API_URL = "https://export.arxiv.org/api/query";
// Regex for the leading http(s)://arxiv.org/abs/ of an abstract url; removing it yields the arXiv ID.
// Capture group 1 is the whole prefix. The dots are escaped so they match a literal '.'
// only; an unescaped '.' is a regex wildcard and would also accept e.g. "arxivXorg".
private static final String ARXIV_URL_PREFIX_FOR_ID = "(https?://arxiv\\.org/abs/)";
private static final Pattern URL_PATTERN = Pattern.compile(ARXIV_URL_PREFIX_FOR_ID);

private final ImportFormatPreferences importFormatPreferences;

Expand All @@ -69,10 +75,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {

try {
Optional<URL> pdfUrl = searchForEntries(entry).stream()
.map(ArXivEntry::getPdfUrl)
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();
.map(ArXivEntry::getPdfUrl)
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();

if (pdfUrl.isPresent()) {
LOGGER.info("Fulltext PDF found @ arXiv.");
Expand Down Expand Up @@ -159,7 +165,7 @@ private List<ArXivEntry> searchForEntries(String searchQuery) throws FetcherExce
}

private List<ArXivEntry> queryApi(String searchQuery, List<ArXivIdentifier> ids, int start, int maxResults)
throws FetcherException {
throws FetcherException {
Document result = callApi(searchQuery, ids, start, maxResults);
List<Node> entries = XMLUtil.asList(result.getElementsByTagName("entry"));

Expand Down Expand Up @@ -195,7 +201,7 @@ private Document callApi(String searchQuery, List<ArXivIdentifier> ids, int star
}
if (!ids.isEmpty()) {
uriBuilder.addParameter("id_list",
ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
ids.stream().map(ArXivIdentifier::getNormalized).collect(Collectors.joining(",")));
}
uriBuilder.addParameter("start", String.valueOf(start));
uriBuilder.addParameter("max_results", String.valueOf(maxResults));
Expand Down Expand Up @@ -252,7 +258,8 @@ public HelpFile getHelpPage() {
/**
 * Runs {@code searchForEntries(query)} and converts every resulting entry into a
 * {@link BibEntry} via {@code toBibEntry}, passing the keyword separator taken from
 * {@code importFormatPreferences}.
 *
 * @param query the search query handed to {@code searchForEntries}
 * @return the converted entries, collected into a list
 * @throws FetcherException declared by this method; presumably raised by the underlying search
 */
@Override
public List<BibEntry> performSearch(String query) throws FetcherException {
return searchForEntries(query).stream().map(
// NOTE(review): diff view without +/- markers - the next line appears to be the
// pre-commit form and the two lines after it its re-wrapped replacement.
(arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())).collect(Collectors.toList());
(arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()))
.collect(Collectors.toList());
}

@Override
Expand All @@ -266,10 +273,10 @@ public Optional<BibEntry> performSearchById(String identifier) throws FetcherExc
/**
 * Looks up {@code entry} via {@code searchForEntries} and returns the id of the first
 * result that has one.
 *
 * @param entry the entry to search for
 * @return the first available identifier, or an empty Optional when no result carries an id
 * @throws FetcherException declared by this method; presumably raised by the underlying search
 */
@Override
public Optional<ArXivIdentifier> findIdentifier(BibEntry entry) throws FetcherException {
return searchForEntries(entry).stream()
// NOTE(review): diff view without +/- markers - the four-step chain below is listed
// twice because the commit only re-indented it (old lines first, then new lines).
.map(ArXivEntry::getId)
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();
.map(ArXivEntry::getId)
.filter(Optional::isPresent)
.map(Optional::get)
.findFirst();
}

@Override
Expand All @@ -290,7 +297,6 @@ private static class ArXivEntry {
private final Optional<String> journalReferenceText;
private final Optional<String> primaryCategory;


public ArXivEntry(Node item) {
// see https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned

Expand Down Expand Up @@ -347,7 +353,7 @@ public ArXivEntry(Node item) {
// Primary category
// Ex: <arxiv:primary_category xmlns:arxiv="https://arxiv.org/schemas/atom" term="math-ph" scheme="http://arxiv.org/schemas/atom"/>
primaryCategory = XMLUtil.getNode(item, "arxiv:primary_category")
.flatMap(node -> XMLUtil.getAttributeContent(node, "term"));
.flatMap(node -> XMLUtil.getAttributeContent(node, "term"));
}

public static String correctLineBreaks(String s) {
Expand All @@ -367,14 +373,15 @@ public Optional<URL> getPdfUrl() {
* Returns the arXiv identifier
*/
public Optional<String> getIdString() {
// remove leading http://arxiv.org/abs/ from abstract url to get arXiv ID
String prefix = "http://arxiv.org/abs/";

return urlAbstractPage.map(abstractUrl -> {
if (abstractUrl.startsWith(prefix)) {
return abstractUrl.substring(prefix.length());
Matcher matcher = URL_PATTERN.matcher(abstractUrl);
if (matcher.find()) {
return abstractUrl.substring(matcher.group(1).length());
} else {
return abstractUrl;
}

});
}

Expand Down Expand Up @@ -409,8 +416,7 @@ public BibEntry toBibEntry(Character keywordDelimiter) {
getDate().ifPresent(date -> bibEntry.setField(FieldName.DATE, date));
primaryCategory.ifPresent(category -> bibEntry.setField(FieldName.EPRINTCLASS, category));
journalReferenceText.ifPresent(journal -> bibEntry.setField(FieldName.JOURNALTITLE, journal));
getPdfUrl().ifPresent(url -> bibEntry
.setFiles(Collections.singletonList(new LinkedFile(url, "PDF"))));
getPdfUrl().ifPresent(url -> bibEntry.setFiles(Collections.singletonList(new LinkedFile(url, "PDF"))));
return bibEntry;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,21 @@ public class DBLPFetcherTest {
public void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class);
when(importFormatPreferences.getFieldContentParserPreferences())
.thenReturn(mock(FieldContentParserPreferences.class));
.thenReturn(mock(FieldContentParserPreferences.class));
dblpFetcher = new DBLPFetcher(importFormatPreferences);
entry = new BibEntry();

entry.setType(BibtexEntryTypes.ARTICLE.getName());
entry.setCiteKey("DBLP:journals/stt/GeigerHL16");
entry.setField(FieldName.TITLE,
"Process Engine Benchmarking with Betsy in the Context of {ISO/IEC} Quality Standards");
"Process Engine Benchmarking with Betsy in the Context of {ISO/IEC} Quality Standards");
entry.setField(FieldName.AUTHOR, "Matthias Geiger and Simon Harrer and J{\\\"{o}}rg Lenhard");
entry.setField(FieldName.JOURNAL, "Softwaretechnik-Trends");
entry.setField(FieldName.VOLUME, "36");
entry.setField(FieldName.NUMBER, "2");
entry.setField(FieldName.YEAR, "2016");
entry.setField(FieldName.URL,
"http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf");
"http://pi.informatik.uni-siegen.de/stt/36_2/./03_Technische_Beitraege/ZEUS2016/beitrag_2.pdf");
entry.setField("biburl", "https://dblp.org/rec/bib/journals/stt/GeigerHL16");
entry.setField("bibsource", "dblp computer science bibliography, https://dblp.org");

Expand All @@ -65,8 +65,7 @@ public void findSingleEntryUsingComplexOperators() throws FetcherException {
}

/**
 * Checks that {@code performSearch} with an empty query string returns an empty list.
 */
@Test
// NOTE(review): diff view without +/- markers - the method header appears twice because
// the commit moved the opening brace onto the signature line (old form first, then new).
public void findNothing() throws Exception
{
public void findNothing() throws Exception {
assertEquals(Collections.emptyList(), dblpFetcher.performSearch(""));
}

Expand Down

0 comments on commit 39e8b33

Please sign in to comment.