Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #2775: Hyphens in last names are properly parsed #3209

Merged
merged 1 commit into from
Sep 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
- We fixed wrong hotkey being displayed at "automatically file links" in the entry editor
- We fixed an issue where metadata syncing between local and shared databases was unstable. This also fixes syncing of groups and sub-groups in the database. [#2284](https://github.com/JabRef/jabref/issues/2284)
- Renaming files now truncates the filename to not exceed the limit of 255 chars [#2622](https://github.com/JabRef/jabref/issues/2622)
- We improved the handling of hyphens in names. [#2775](https://github.com/JabRef/jabref/issues/2775)

### Removed
- We removed support for LatexEditor, as it is not under active development. [#3199](https://github.com/JabRef/jabref/issues/3199)
Expand Down
75 changes: 46 additions & 29 deletions src/main/java/org/jabref/model/entry/AuthorListParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,6 @@ public class AuthorListParser {
// Constant HashSet containing names of TeX special characters
private static final Set<String> TEX_NAMES = new HashSet<>();

/** the raw bibtex author/editor field */
private String original;

/** index of the start in original, for example to point to 'abc' in 'abc xyz', tokenStart=2 */
private int tokenStart;

/** index of the end in original, for example to point to 'abc' in 'abc xyz', tokenEnd=5 */
private int tokenEnd;

/** end of token abbreviation (always: tokenStart < tokenAbbr <= tokenEnd), only valid if getToken returns TOKEN_WORD */
private int tokenAbbr;


/** either space or dash */
private char tokenTerm;

/** true if upper-case token, false if lower-case */
private boolean tokenCase;

static {
TEX_NAMES.add("aa");
TEX_NAMES.add("ae");
Expand All @@ -66,6 +47,32 @@ public class AuthorListParser {
TEX_NAMES.add("j");
}

/**
 * the raw bibtex author/editor field
 */
private String original;
/**
 * index of the start in original, for example to point to 'abc' in 'abc xyz', tokenStart=2
 */
private int tokenStart;
/**
 * index of the end in original, for example to point to 'abc' in 'abc xyz', tokenEnd=5
 */
private int tokenEnd;
/**
 * end of token abbreviation (always: tokenStart < tokenAbbrEnd <= tokenEnd), only valid if getToken returns
 * TOKEN_WORD; getToken resets it to -1 before scanning a word and treats a negative value as "not yet determined"
 */
private int tokenAbbrEnd;
/**
 * character that terminates the current token: either a space or a dash ('-')
 */
private char tokenTerm;
/**
 * true if upper-case token, false if lower-case
 */
private boolean tokenCase;

/**
* Parses the String containing person names and returns a list of person information.
*
Expand Down Expand Up @@ -121,7 +128,7 @@ private Optional<Author> getAuthor() {
break;
case TOKEN_WORD:
tokens.add(original.substring(tokenStart, tokenEnd));
tokens.add(original.substring(tokenStart, tokenAbbr));
tokens.add(original.substring(tokenStart, tokenAbbrEnd));
tokens.add(tokenTerm);
tokens.add(tokenCase);
if (commaFirst >= 0) {
Expand All @@ -137,6 +144,13 @@ private Optional<Author> getAuthor() {
// We are in a first name which contained a hyphen
break;
}

int thisTermToken = previousTermToken + TOKEN_GROUP_LENGTH;
if ((thisTermToken >= 0) && tokens.get(thisTermToken).equals('-')) {
// We are in a name which contained a hyphen
break;
}

vonStart = tokens.size() - TOKEN_GROUP_LENGTH;
break;
}
Expand Down Expand Up @@ -194,14 +208,16 @@ private Optional<Author> getAuthor() {
firstPartStart = 0;
}
}
} else { // commas are present: it affects only 'first part' and
// 'junior part'
} else {
// commas are present: it affects only 'first part' and 'junior part'
firstPartEnd = tokens.size();
if (commaSecond < 0) { // one comma
if (commaSecond < 0) {
// one comma
if (commaFirst < firstPartEnd) {
firstPartStart = commaFirst;
}
} else { // two or more commas
} else {
// two or more commas
if (commaSecond < firstPartEnd) {
firstPartStart = commaSecond;
}
Expand Down Expand Up @@ -342,7 +358,7 @@ private int getToken() {
tokenEnd++;
return TOKEN_AND;
}
tokenAbbr = -1;
tokenAbbrEnd = -1;
tokenTerm = ' ';
tokenCase = true;
int bracesLevel = 0;
Expand All @@ -353,8 +369,9 @@ private int getToken() {
if (c == '{') {
bracesLevel++;
}
if (firstLetterIsFound && (tokenAbbr < 0) && ((bracesLevel == 0) || (c == '{'))) {
tokenAbbr = tokenEnd;

if (firstLetterIsFound && (tokenAbbrEnd < 0) && ((bracesLevel == 0) || (c == '{'))) {
tokenAbbrEnd = tokenEnd;
}
if ((c == '}') && (bracesLevel > 0)) {
bracesLevel--;
Expand Down Expand Up @@ -388,8 +405,8 @@ private int getToken() {
}
tokenEnd++;
}
if (tokenAbbr < 0) {
tokenAbbr = tokenEnd;
if (tokenAbbrEnd < 0) {
tokenAbbrEnd = tokenEnd;
}
if ((tokenEnd < original.length()) && (original.charAt(tokenEnd) == '-')) {
tokenTerm = '-';
Expand Down
27 changes: 23 additions & 4 deletions src/test/java/org/jabref/model/entry/AuthorListTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

public class AuthorListTest {

/**
 * Parses the given BibTeX author string and reports how many authors it contains.
 *
 * @param bibtex the raw author/editor field value to parse
 * @return the number of authors found by {@code AuthorList.parse}
 */
public static int size(String bibtex) {
    AuthorList parsedAuthors = AuthorList.parse(bibtex);
    return parsedAuthors.getNumberOfAuthors();
}

@Test
public void testFixAuthorNatbib() {
Assert.assertEquals("", AuthorList.fixAuthorNatbib(""));
Expand Down Expand Up @@ -286,10 +290,6 @@ public void testFixAuthorForAlphabetization() {
.fixAuthorForAlphabetization("John von Neumann and John Smith and de Black Brown, Jr., Peter"));
}

// Helper for the size tests: parses the BibTeX author string and returns the number of authors found.
public static int size(String bibtex) {
return AuthorList.parse(bibtex).getNumberOfAuthors();
}

@Test
public void testSize() {

Expand Down Expand Up @@ -625,6 +625,25 @@ public void parseNameWithHyphenInLastName() throws Exception {
Assert.assertEquals(new AuthorList(expected), AuthorList.parse("Firstname Bailey-Jones"));
}

/**
 * A hyphenated last name with a brace-protected letter, preceded by initials,
 * must be parsed as one author whose name part "El-{M}allah" stays in one piece.
 */
@Test
public void parseNameWithHyphenInLastNameWithInitials() throws Exception {
    Author hyphenatedAuthor = new Author("E. S.", "E. S.", null, "El-{M}allah", null);
    AuthorList expectedList = new AuthorList(hyphenatedAuthor);

    AuthorList parsedList = AuthorList.parse("E. S. El-{M}allah");

    Assert.assertEquals(expectedList, parsedList);
}

/**
 * A hyphenated last name in which both halves start with a brace-protected letter
 * must be parsed as one author whose name part "{K}ent-{B}oswell" stays in one piece.
 */
@Test
public void parseNameWithHyphenInLastNameWithEscaped() throws Exception {
    Author hyphenatedAuthor = new Author("E. S.", "E. S.", null, "{K}ent-{B}oswell", null);
    AuthorList expectedList = new AuthorList(hyphenatedAuthor);

    AuthorList parsedList = AuthorList.parse("E. S. {K}ent-{B}oswell");

    Assert.assertEquals(expectedList, parsedList);
}

@Test
public void parseNameWithHyphenInLastNameWhenLastNameGivenFirst() throws Exception {
// TODO: Fix abbreviation to be "A."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you get this working too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sadly no. I tried to fix it but the extraction of the abbreviation from the text is hidden in the whole tokenizing process and thus not easy to change.

Author expected = new Author("ʿAbdallāh", "ʿ.", null, "al-Ṣāliḥ", null);
Assert.assertEquals(new AuthorList(expected), AuthorList.parse("al-Ṣāliḥ, ʿAbdallāh"));
}

@Test
public void parseNameWithBraces() throws Exception {
Author expected = new Author("H{e}lene", "H.", null, "Fiaux", null);
Expand Down