Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infer DOI from ArXiv identifier #10449

Merged
merged 4 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
- When searching for an identifier in the "Web search", the title of the search window is now "Identifier-based Web Search". [#10391](https://github.com/JabRef/jabref/pull/10391)
- The ampersand checker now skips verbatim fields (`file`, `url`, ...). [#10419](https://github.com/JabRef/jabref/pull/10419)
- If no existing document is selected for exporting "XMP annotated pdf" JabRef will now create a new PDF file with a sample text and the metadata. [#10102](https://github.com/JabRef/jabref/issues/10102)
- We modified the DOI cleanup to infer the DOI from an ArXiV ID if it's present. [10426](https://github.com/JabRef/jabref/issues/10426)

### Fixed

Expand Down
19 changes: 16 additions & 3 deletions src/main/java/org/jabref/logic/cleanup/DoiCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,19 @@
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.field.UnknownField;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;

/**
* Formats the DOI (e.g. removes http part) and also moves DOIs from note, url or ee field to the doi field.
* Formats the DOI (e.g. removes http part) and also infers DOIs from the note, url, eprint or ee fields.
*/
public class DoiCleanup implements CleanupJob {

/**
* Fields to check for DOIs.
*/
private static final List<Field> FIELDS = Arrays.asList(StandardField.NOTE, StandardField.URL, new UnknownField("ee"));
private static final List<Field> FIELDS = Arrays.asList(StandardField.NOTE, StandardField.URL, StandardField.EPRINT,
new UnknownField("ee"));

@Override
public List<FieldChange> cleanup(BibEntry entry) {
Expand Down Expand Up @@ -57,14 +59,25 @@ public List<FieldChange> cleanup(BibEntry entry) {
} else {
// As the Doi field is empty we now check if note, url, or ee field contains a Doi
for (Field field : FIELDS) {
Optional<DOI> doi = entry.getField(field).flatMap(DOI::parse);
Optional<String> fieldContentOpt = entry.getField(field);

Optional<DOI> doi = fieldContentOpt.flatMap(DOI::parse);

if (doi.isPresent()) {
// Update Doi
Optional<FieldChange> change = entry.setField(StandardField.DOI, doi.get().getDOI());
change.ifPresent(changes::add);
removeFieldValue(entry, field, changes);
}

if (StandardField.EPRINT == field) {
fieldContentOpt.flatMap(ArXivIdentifier::parse)
.flatMap(ArXivIdentifier::inferDOI)
.ifPresent(inferredDoi -> {
Optional<FieldChange> change = entry.setField(StandardField.DOI, inferredDoi.getDOI());
change.ifPresent(changes::add);
});
}
}
}
return changes;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,14 @@

import org.jabref.model.strings.StringUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Identifier for the arXiv. See https://arxiv.org/help/arxiv_identifier
*/
public class ArXivIdentifier extends EprintIdentifier {
private static final Logger LOGGER = LoggerFactory.getLogger(ArXivIdentifier.class);

private static final String ARXIV_PREFIX = "http(s)?://arxiv.org/(abs|pdf)/|arxiv|arXiv";
private final String identifier;
Expand Down Expand Up @@ -71,6 +75,22 @@ public Optional<String> getClassification() {
}
}

/**
* ArXiV articles are assigned DOIs automatically, which starts with a DOI prefix '10.48550/' followed by the ArXiV
* ID (replacing the colon with a period).
*<p>
* For more information:
* <a href="https://blog.arxiv.org/2022/02/17/new-arxiv-articles-are-now-automatically-assigned-dois/">
* new-arxiv-articles-are-now-automatically-assigned-dois</a>
* */
public Optional<DOI> inferDOI() {
if (StringUtil.isBlank(identifier)) {
return Optional.empty();
}

return DOI.parse("10.48550/arxiv." + identifier);
}

@Override
public String toString() {
return "ArXivIdentifier{" +
Expand Down