Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check duplicate DOI #6333

Merged
merged 21 commits into from
May 15, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We added support for basic markdown in custom formatted previews [#6194](https://github.com/JabRef/jabref/issues/6194)
- We now show the number of items found and selected to import in the online search dialog. [#6248](https://github.com/JabRef/jabref/pull/6248)
- We created a new install screen for macOS. [#5759](https://github.com/JabRef/jabref/issues/5759)
- We added a new integrity check for duplicate DOIs. [koppor#339](https://github.com/koppor/jabref/issues/339)

### Changed

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.List;
import java.util.Map;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;
Expand All @@ -17,7 +16,6 @@ public class BibStringChecker implements Checker {
// Detect # if it doesn't have a \ in front of it or if it starts the string
private static final Pattern UNESCAPED_HASH = Pattern.compile("(?<!\\\\)#|^#");


/**
* Checks, if there is an even number of unescaped #
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import java.util.Collections;
import java.util.List;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.InternalField;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.List;
import java.util.Optional;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.InternalField;
Expand All @@ -21,7 +20,7 @@ public List<IntegrityMessage> check(BibEntry entry) {
Optional<String> author = entry.getField(StandardField.AUTHOR);
Optional<String> title = entry.getField(StandardField.TITLE);
Optional<String> year = entry.getField(StandardField.YEAR);
if (!author.isPresent() || !title.isPresent() || !year.isPresent()) {
if (author.isEmpty() || title.isEmpty() || year.isEmpty()) {
return Collections.emptyList();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabase;
import org.jabref.model.entry.BibEntry;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import org.jabref.logic.bibtexkeypattern.BibtexKeyGenerator;
import org.jabref.logic.bibtexkeypattern.BibtexKeyPatternPreferences;
import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabaseContext;
import org.jabref.model.entry.BibEntry;
Expand All @@ -26,7 +25,7 @@ public BibtexkeyDeviationChecker(BibDatabaseContext bibDatabaseContext, BibtexKe
@Override
public List<IntegrityMessage> check(BibEntry entry) {
Optional<String> valuekey = entry.getCiteKeyOptional();
if (!valuekey.isPresent()) {
if (valuekey.isEmpty()) {
return Collections.emptyList();
}

Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/jabref/logic/integrity/Checker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package org.jabref.logic.integrity;

import java.util.List;

import org.jabref.model.entry.BibEntry;

@FunctionalInterface
public interface Checker {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to EntryChecker?

List<IntegrityMessage> check(BibEntry entry);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.jabref.logic.integrity;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import javafx.collections.ObservableList;

import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabase;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

public class DoiDuplicationChecker implements Checker {
private final BibDatabase database;
private Map<BibEntry, List<IntegrityMessage>> errors;

public DoiDuplicationChecker(BibDatabase database) {
this.database = Objects.requireNonNull(database);
}

@Override
public List<IntegrityMessage> check(BibEntry entry) {
if (errors == null) {
errors = new HashMap<>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not directly initialize that HashMap in the field declaration?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MAybe you did not see that the later code uses the BibtexDatabase from the constructor. We decided to put the code into check - and not into the constructor - to have a lean constructor and not have high CPU usage when initializing the checker, but at the first time an entry is checked.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like premature optimization for me.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you caching the errors anyway? Also in case you are worrying about performance, why do you choose to implement a solution that has O(n^2) (with n = entries). Why not implement a O(n) solution (e.g. https://stackoverflow.com/a/31341963/873661) which you call in checkDatabase below?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See in line 30 that the method is called once per entry. Is is by design of the Checker interface that there is method for checking each entry. There is no method for checking a complete database. There is also no interface for a complete database.

At the first call, the result map is filled. At all subsequent calls do not fill the map and reuse it.

We are O(n+m) where n is the size of the database and m is the number of entries with DOIs.

Copy link
Member Author

@koppor koppor Apr 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make the code more reable, I moved the call to the error check to the constructor. See fd7621d


ObservableList<BibEntry> bibEntries = database.getEntries();
BiMap<DOI, List<BibEntry>> duplicateMap = HashBiMap.create(bibEntries.size());
for (BibEntry bibEntry : bibEntries) {
bibEntry.getDOI().ifPresent(doi ->
duplicateMap.computeIfAbsent(doi, x -> new ArrayList<>()).add(bibEntry));
Copy link
Member Author

@koppor koppor Apr 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is doi, not entry. However, doi cannot be used as the variable name is already bound.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@koppor Please have a close look. The second argument of duplicateMap.computeIfAbsent refers to the value and this clearly is of type List. So then maybe name it entries

Copy link
Member Author

@koppor koppor Apr 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Siedlerchr Please explain the JavaDoc to me. The Map is FROM doi TO list. https://www.geeksforgeeks.org/hashmap-computeifabsent-method-in-java-with-examples/ explains the computeIfAbsent.

To me, it reads, that the key is passed to Function<? super K, ? extends V> remappingFunction. The key is clearly (!) a DOI not a list. -- Am I wrong?

Copy link
Member Author

@koppor koppor Apr 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make the code more reable, I called the variable absentDoi. See 6f9b148

}

duplicateMap.inverse().keySet().stream()
.filter(list -> list.size() > 1)
.flatMap(list -> list.stream())
.forEach(item -> {
IntegrityMessage errorMessage = new IntegrityMessage(Localization.lang("Unique DOI used in multiple entries"), item, StandardField.DOI);
errors.put(item, List.of(errorMessage));
});
}
return errors.getOrDefault(entry, Collections.emptyList());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import org.jabref.model.entry.identifier.DOI;
import org.jabref.model.strings.StringUtil;

public class DOIValidityChecker implements ValueChecker {

public class DoiValidityChecker implements ValueChecker {
@Override
public Optional<String> checkValue(String value) {
if (StringUtil.isBlank(value)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.util.Objects;
import java.util.Set;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabase;
import org.jabref.model.entry.BibEntry;
Expand All @@ -28,14 +27,14 @@ public List<IntegrityMessage> check(BibEntry entry) {
for (Entry<Field, String> field : entry.getFieldMap().entrySet()) {
Set<FieldProperty> properties = field.getKey().getProperties();
if (properties.contains(FieldProperty.SINGLE_ENTRY_LINK)) {
if (!database.getEntryByKey(field.getValue()).isPresent()) {
if (database.getEntryByKey(field.getValue()).isEmpty()) {
result.add(new IntegrityMessage(Localization.lang("Referenced BibTeX key does not exist"), entry,
field.getKey()));
}
} else if (properties.contains(FieldProperty.MULTIPLE_ENTRY_LINK)) {
List<String> keys = new ArrayList<>(Arrays.asList(field.getValue().split(",")));
for (String key : keys) {
if (!database.getEntryByKey(key).isPresent()) {
if (database.getEntryByKey(key).isEmpty()) {
result.add(new IntegrityMessage(
Localization.lang("Referenced BibTeX key does not exist") + ": " + key, entry,
field.getKey()));
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/jabref/logic/integrity/FieldChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import org.jabref.model.entry.field.Field;
import org.jabref.model.util.OptionalUtil;

public class FieldChecker implements IntegrityCheck.Checker {
public class FieldChecker implements Checker {
protected final Field field;
private final ValueChecker checker;

Expand All @@ -21,7 +21,7 @@ public FieldChecker(Field field, ValueChecker checker) {
@Override
public List<IntegrityMessage> check(BibEntry entry) {
Optional<String> value = entry.getField(field);
if (!value.isPresent()) {
if (value.isEmpty()) {
return Collections.emptyList();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ private static Multimap<Field, ValueChecker> getAllMap(BibDatabaseContext databa
fieldCheckers.put(StandardField.BOOKTITLE, new BooktitleChecker());
fieldCheckers.put(StandardField.TITLE, new BracketChecker());
fieldCheckers.put(StandardField.TITLE, new TitleChecker(databaseContext));
fieldCheckers.put(StandardField.DOI, new DOIValidityChecker());
fieldCheckers.put(StandardField.DOI, new DoiValidityChecker());
fieldCheckers.put(StandardField.EDITION, new EditionChecker(databaseContext, allowIntegerEdition));
fieldCheckers.put(StandardField.FILE, new FileChecker(databaseContext, filePreferences));
fieldCheckers.put(StandardField.HOWPUBLISHED, new HowPublishedChecker(databaseContext));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.Field;
Expand Down
77 changes: 53 additions & 24 deletions src/main/java/org/jabref/logic/integrity/IntegrityCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,19 @@ public class IntegrityCheck {
private final JournalAbbreviationRepository journalAbbreviationRepository;
private final boolean enforceLegalKey;
private final boolean allowIntegerEdition;
private ASCIICharacterChecker asciiCharacterChecker;
private NoBibtexFieldChecker noBibtexFieldChecker;
private BibTeXEntryTypeChecker bibTeXEntryTypeChecker;
private BibtexKeyChecker bibtexKeyChecker;
private TypeChecker typeChecker;
private BibStringChecker bibStringChecker;
private HTMLCharacterChecker htmlCharacterChecker;
private EntryLinkChecker entryLinkChecker;
private BibtexkeyDeviationChecker bibtexkeyDeviationChecker;
private BibtexKeyDuplicationChecker bibtexKeyDuplicationChecker;
private JournalInAbbreviationListChecker journalInAbbreviationListChecker;
private FieldCheckers fieldCheckers;
private DoiDuplicationChecker doiDuplicationChecker;

public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
FilePreferences filePreferences,
Expand All @@ -32,9 +45,36 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext,
this.journalAbbreviationRepository = Objects.requireNonNull(journalAbbreviationRepository);
this.enforceLegalKey = enforceLegalKey;
this.allowIntegerEdition = allowIntegerEdition;
initCheckers(bibDatabaseContext, bibtexKeyPatternPreferences, journalAbbreviationRepository);
}

public List<IntegrityMessage> checkDatabase() {
private void initCheckers(BibDatabaseContext bibDatabaseContext, BibtexKeyPatternPreferences bibtexKeyPatternPreferences, JournalAbbreviationRepository journalAbbreviationRepository) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I don't see any advantage of this refactoring. Just makes the code more complex and harder to maintain in my opinion.

If you really feel like the original code needs a refactoring, then you create a list of all checkers that need to be run a) always, b) bibtex c)biblatex (still I would create these lists only in the checkdatabase method). This would get ride of the repeated check methods below. But to be honest, I don't think this yields a more readable code either.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We initialize the checkers once per database check - not once per entry check. In case of a 20k database, approx 100k less java objects in memory per quality check.

We ensured that each checker is stateless - thus, it can be reused for acroess entries.

I agree with your proposal though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With your proposal, the code is much more reable. 🎉 03dd2a0

asciiCharacterChecker = new ASCIICharacterChecker();
noBibtexFieldChecker = new NoBibtexFieldChecker();
bibTeXEntryTypeChecker = new BibTeXEntryTypeChecker();
bibtexKeyChecker = new BibtexKeyChecker();
typeChecker = new TypeChecker();
bibStringChecker = new BibStringChecker();
htmlCharacterChecker = new HTMLCharacterChecker();
entryLinkChecker = new EntryLinkChecker(bibDatabaseContext.getDatabase());
bibtexkeyDeviationChecker = new BibtexkeyDeviationChecker(bibDatabaseContext, bibtexKeyPatternPreferences);
bibtexKeyDuplicationChecker = new BibtexKeyDuplicationChecker(bibDatabaseContext.getDatabase());
doiDuplicationChecker = new DoiDuplicationChecker(bibDatabaseContext.getDatabase());

if (bibDatabaseContext.isBiblatexMode()) {
journalInAbbreviationListChecker = new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository);
} else {
journalInAbbreviationListChecker = new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository);
}

fieldCheckers = new FieldCheckers(bibDatabaseContext,
filePreferences,
journalAbbreviationRepository,
enforceLegalKey,
allowIntegerEdition);
}

List<IntegrityMessage> checkDatabase() {
List<IntegrityMessage> result = new ArrayList<>();

for (BibEntry entry : bibDatabaseContext.getDatabase().getEntries()) {
Expand All @@ -51,38 +91,27 @@ public List<IntegrityMessage> checkEntry(BibEntry entry) {
return result;
}

FieldCheckers fieldCheckers = new FieldCheckers(bibDatabaseContext,
filePreferences,
journalAbbreviationRepository,
enforceLegalKey,
allowIntegerEdition);
for (FieldChecker checker : fieldCheckers.getAll()) {
result.addAll(checker.check(entry));
}

if (!bibDatabaseContext.isBiblatexMode()) {
// BibTeX only checkers
result.addAll(new ASCIICharacterChecker().check(entry));
result.addAll(new NoBibtexFieldChecker().check(entry));
result.addAll(new BibTeXEntryTypeChecker().check(entry));
result.addAll(new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository).check(entry));
} else {
result.addAll(new JournalInAbbreviationListChecker(StandardField.JOURNALTITLE, journalAbbreviationRepository).check(entry));
result.addAll(asciiCharacterChecker.check(entry));
result.addAll(noBibtexFieldChecker.check(entry));
result.addAll(bibTeXEntryTypeChecker.check(entry));
}

result.addAll(new BibtexKeyChecker().check(entry));
result.addAll(new TypeChecker().check(entry));
result.addAll(new BibStringChecker().check(entry));
result.addAll(new HTMLCharacterChecker().check(entry));
result.addAll(new EntryLinkChecker(bibDatabaseContext.getDatabase()).check(entry));
result.addAll(new BibtexkeyDeviationChecker(bibDatabaseContext, bibtexKeyPatternPreferences).check(entry));
result.addAll(new BibtexKeyDuplicationChecker(bibDatabaseContext.getDatabase()).check(entry));
result.addAll(journalInAbbreviationListChecker.check(entry));
result.addAll(bibtexKeyChecker.check(entry));
result.addAll(typeChecker.check(entry));
result.addAll(bibStringChecker.check(entry));
result.addAll(htmlCharacterChecker.check(entry));
result.addAll(entryLinkChecker.check(entry));
result.addAll(bibtexkeyDeviationChecker.check(entry));
result.addAll(bibtexKeyDuplicationChecker.check(entry));
result.addAll(doiDuplicationChecker.check(entry));

return result;
}

@FunctionalInterface
public interface Checker {
List<IntegrityMessage> check(BibEntry entry);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
import org.jabref.model.entry.field.Field;

public final class IntegrityMessage implements Cloneable {

private final BibEntry entry;

private final Field field;
private final String message;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.journals.JournalAbbreviationRepository;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
Expand All @@ -24,7 +23,7 @@ public JournalInAbbreviationListChecker(Field field, JournalAbbreviationReposito
@Override
public List<IntegrityMessage> check(BibEntry entry) {
Optional<String> value = entry.getField(field);
if (!value.isPresent()) {
if (value.isEmpty()) {
return Collections.emptyList();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.Set;
import java.util.stream.Collectors;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.BibField;
Expand Down
3 changes: 1 addition & 2 deletions src/main/java/org/jabref/logic/integrity/TypeChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import java.util.List;
import java.util.Optional;

import org.jabref.logic.integrity.IntegrityCheck.Checker;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
Expand All @@ -15,7 +14,7 @@ public class TypeChecker implements Checker {
@Override
public List<IntegrityMessage> check(BibEntry entry) {
Optional<String> value = entry.getField(StandardField.PAGES);
if (!value.isPresent()) {
if (value.isEmpty()) {
return Collections.emptyList();
}

Expand Down
Loading