Add support for application/x-bibtex type #532

Merged: 11 commits, Mar 9, 2020
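For context, the Accept type added by this PR can be exercised against a running GROBID service. A minimal sketch in Java, assuming a local service on http://localhost:8070 and the processCitation endpoint with its citations form field (host, port, and the sample citation text are illustrative, not taken from this PR):

import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class BibTexRequestExample {
    public static void main(String[] args) throws Exception {
        // Assumed local GROBID service; adjust host/port as needed.
        String endpoint = "http://localhost:8070/api/processCitation";
        String citation = "A. Author. An example article title. Journal of Examples, 2020.";
        String form = "citations=" + URLEncoder.encode(citation, StandardCharsets.UTF_8);

        HttpRequest request = HttpRequest.newBuilder(URI.create(endpoint))
            .header("Content-Type", "application/x-www-form-urlencoded")
            // The Accept header below is the content type this PR adds support for.
            .header("Accept", "application/x-bibtex")
            .POST(HttpRequest.BodyPublishers.ofString(form))
            .build();

        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        // Expected: a single BibTeX entry such as @article{...} or @misc{...}
        System.out.println(response.body());
    }
}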
6 changes: 6 additions & 0 deletions .editorconfig
@@ -17,6 +17,12 @@ indent_size=2
indent_style=space
indent_size=2

[*.md]
insert_final_newline=true

[{*.yml,*.yaml}]
indent_style=space
indent_size=2

[GrobidRestProcessString.java]
indent_style=tab
5 changes: 5 additions & 0 deletions build.gradle
@@ -5,12 +5,16 @@ buildscript {
mavenLocal()
mavenCentral()
jcenter()
maven {
url 'https://plugins.gradle.org/m2/'
}
}
dependencies {
classpath group: 'net.researchgate', name: 'gradle-release', version: '2.6.0'
classpath 'org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.4.0'
classpath 'com.jfrog.bintray.gradle:gradle-bintray-plugin:1.7.3'
classpath 'com.github.jengelman.gradle.plugins:shadow:5.0.0'
classpath 'com.adarshr:gradle-test-logger-plugin:2.0.0'
}
}

@@ -25,6 +29,7 @@ allprojects {
apply plugin: 'jacoco'
apply plugin: 'base'
apply plugin: 'com.github.kt3k.coveralls'
apply plugin: 'com.adarshr.test-logger'

group = "org.grobid"

377 changes: 201 additions & 176 deletions doc/Grobid-service.md

Large diffs are not rendered by default.

166 changes: 92 additions & 74 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -29,6 +29,7 @@
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -1727,132 +1728,139 @@ else if (string.startsWith("PACS Numbers") ||
}

/**
* Export to BibTeX format
* Export to BibTeX format. Use "id" as BibTeX key.
*/
public String toBibTeX() {
return toBibTeX("id");
}

/**
* Export to BibTeX format
*
* @param id the BibTeX key to use.
*/
public String toBibTeX(String id) {
String bibtex = "";
try {
return toBibTeX(id, new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().includeRawCitations(false).build());
}

if (journal != null) {
bibtex += "@article{" + id + ",\n";
} else if (book_type != null) {
bibtex += "@techreport{" + id + ",\n";
} else if (bookTitle != null) {
if ((bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
(bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
bibtex += "@inproceedings{" + id + ",\n";
} else {
bibtex += "@article{" + id + ",\n"; // ???
}
/**
* Export to BibTeX format
*
* @param id the BibTeX key to use
*/
public String toBibTeX(String id, GrobidAnalysisConfig config) {
String type;
if (journal != null) {
type = "article";
} else if (book_type != null) {
type = "techreport";
} else if (bookTitle != null) {
if (StringUtils.containsIgnoreCase(bookTitle, "proceedings") ||
(bookTitle.startsWith("proc")) || (bookTitle.startsWith("Proc")) ||
(bookTitle.startsWith("In Proc")) || (bookTitle.startsWith("In proc"))) {
type = "inproceedings";
} else {
bibtex += "@misc{" + id + ",\n"; // ???
LOGGER.debug("No journal given, but a booktitle. However, the booktitle does not start with \"proc\" or similar strings. Returning inbook");
type = "inbook";
}
} else {
// using "misc" as fallback type
type = "misc";
}

StringJoiner bibtex = new StringJoiner(",\n", "@" + type + "{" + id + ",\n", "\n}\n");

try {

// author
// fullAuthors has to be used instead
if (collaboration != null) {
bibtex += "author\t=\t\"" + collaboration;
} else if (fullAuthors != null) {
if (fullAuthors.size() > 0) {
boolean begin = true;
for (Person person : fullAuthors) {
if (begin) {
bibtex += "author\t=\t\"" + person.getFirstName() + " " + person.getLastName();
begin = false;
} else
bibtex += " and " + person.getFirstName() + " " + person.getLastName();
}
bibtex += "\"";
}
} else if (authors != null) {
StringTokenizer st = new StringTokenizer(authors, ";");
if (st.countTokens() > 1) {
boolean begin = true;
bibtex.add(" author = {" + collaboration + "}");
} else {
StringJoiner authors = new StringJoiner(" and ", " author = {", "}");
if (fullAuthors != null) {
fullAuthors.stream()
.filter(person -> person != null)
.forEachOrdered(person -> {
String author = person.getLastName();
if (person.getFirstName() != null) {
author += ", ";
author += person.getFirstName();
}
authors.add(author);
});
} else if (this.authors != null) {
StringTokenizer st = new StringTokenizer(this.authors, ";");
while (st.hasMoreTokens()) {
String author = st.nextToken();
if (author != null)
author = author.trim();
if (begin) {
bibtex += "author\t=\t\"" + author;
begin = false;
} else
bibtex += " and " + author;

if (author != null) {
authors.add(author.trim());
}
}
bibtex += "\"";
} else {
if (authors != null)
bibtex += "author\t=\t\"" + authors + "\"";
}
bibtex.add(authors.toString());
}

// title
if (title != null) {
bibtex += ",\ntitle\t=\t\"" + title + "\"";
bibtex.add(" title = {" + title + "}");
}

// journal
if (journal != null) {
bibtex += ",\njournal\t=\t\"" + journal + "\"";
bibtex.add(" journal = {" + journal + "}");
}

// booktitle
if ((journal == null) && (book_type == null) && (bookTitle != null)) {
bibtex += ",\nbooktitle\t=\t\"" + bookTitle + "\"";
bibtex.add(" booktitle = {" + bookTitle + "}");
}

// publisher
if (publisher != null) {
bibtex += ",\npublisher\t=\t\"" + publisher + "\"";
bibtex.add(" publisher = {" + publisher + "}");
}

// editors
if (editors != null) {
String locEditors = editors.replace(" ; ", " and ");
bibtex += ",\neditor\t=\t\"" + locEditors + "\"";
bibtex.add(" editor = {" + locEditors + "}");
}
// fullEditors has to be used instead

// year
if (publication_date != null) {
bibtex += ",\nyear\t=\t\"" + publication_date + "\"";
bibtex.add(" year = {" + publication_date + "}");
}

// location
// address
if (location != null) {
bibtex += ",\naddress\t=\t\"" + location + "\"";
bibtex.add(" address = {" + location + "}");
}

// pages
if (pageRange != null) {
bibtex += ",\npages\t=\t\"" + pageRange + "\"";
bibtex.add(" pages = {" + pageRange + "}");
}

// volume
if (volumeBlock != null) {
bibtex += ",\nvolume\t=\t\"" + volumeBlock + "\"";
bibtex.add(" volume = {" + volumeBlock + "}");
}

// issue (named number in BibTeX)
if (issue != null) {
bibtex += ",\nnumber\t=\t\"" + issue + "\"";
bibtex.add(" number = {" + issue + "}");
}

// DOI
if (!StringUtils.isEmpty(doi)) {
bibtex += ",\ndoi\t=\t\"" + doi + "\"";
bibtex.add(" doi = {" + doi + "}");
}

// arXiv identifier
if (!StringUtils.isEmpty(arXivId)) {
bibtex += ",\neprint\t=\t\"" + arXivId + "\"";
bibtex.add(" eprint = {" + arXivId + "}");
}
/* note that the following is now recommended for arXiv citations:
archivePrefix = "arXiv",
@@ -1864,30 +1872,27 @@ public String toBibTeX(String id) {

// abstract
if (!StringUtils.isEmpty(abstract_)) {
bibtex += ",\nabstract\t=\t\"" + abstract_ + "\"";
bibtex.add(" abstract = {" + abstract_ + "}");
}

// keywords
if (keywords != null) {
bibtex += ",\nkeywords\t=\t\"";
boolean begin = true;
for (Keyword keyw : keywords) {
if ( (keyw.getKeyword() == null) || (keyw.getKeyword().length() == 0) )
continue;
if (begin) {
begin = false;
bibtex += keyw.getKeyword();
} else
bibtex += ", " + keyw.getKeyword();
}
bibtex += "\"";
String value = keywords.stream()
.map(keyword -> keyword.getKeyword())
.filter(keyword -> !StringUtils.isBlank(keyword))
.collect(Collectors.joining(", ", "keywords = {", "}"));
bibtex.add(value);
}

bibtex += "\n}\n";
if (config.getIncludeRawCitations() && !StringUtils.isEmpty(reference) ) {
// escape all " signs
bibtex.add(" raw = {" + reference + "}");
}
} catch (Exception e) {
LOGGER.error("Cannot export BibTex format, because of nested exception.", e);
throw new GrobidException("Cannot export BibTex format, because of nested exception.", e);
}
return bibtex;
return bibtex.toString();
}

/**
@@ -1940,16 +1945,29 @@ public void checkIdentifier() {
*
* @param n - the index of the bibliographical record, the corresponding id will be b+n
*/

public String toTEI(int n) {
return toTEI(n, 0, GrobidAnalysisConfig.defaultInstance());
}

/**
* Export the bibliographical item into a TEI BiblStruct string
*
* @param n - the index of the bibliographical record, the corresponding id will be b+n
*/
public String toTEI(int n, GrobidAnalysisConfig config) {
return toTEI(n, 0, config);
}

/**
* Export the bibliographical item into a TEI BiblStruct string
*
* @param n - the index of the bibliographical record, the corresponding id will be b+n
* @param indent - the tabulation indentation for the output of the xml elements
*/
public String toTEI(int n, int indent) {
return toTEI(n, indent, GrobidAnalysisConfig.defaultInstance());
}


/**
* Export the bibliographical item into a TEI BiblStruct string
*
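The refactored toBibTeX above builds the entry with java.util.StringJoiner instead of repeated string concatenation: the prefix opens the entry, the delimiter separates fields, and the suffix closes it, so no trailing-comma bookkeeping is needed. A minimal standalone sketch of that pattern (the field values are placeholders, not taken from this diff):

import java.util.StringJoiner;

public class StringJoinerBibTexSketch {
    public static void main(String[] args) {
        String type = "article";
        String id = "id";
        // Prefix opens the entry, the delimiter separates fields, the suffix closes it.
        StringJoiner bibtex = new StringJoiner(",\n", "@" + type + "{" + id + ",\n", "\n}\n");
        bibtex.add(" title = {An example article title}");
        bibtex.add(" journal = {Journal of Examples}");
        bibtex.add(" year = {2020}");
        System.out.print(bibtex.toString());
        // Prints:
        // @article{id,
        //  title = {An example article title},
        //  journal = {Journal of Examples},
        //  year = {2020}
        // }
    }
}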
grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java
@@ -30,6 +30,8 @@
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
@@ -45,6 +47,8 @@
* @author Patrice Lopez
*/
public class CitationParser extends AbstractParser {
private static final Logger LOGGER = LoggerFactory.getLogger(AbstractParser.class);

public Lexicon lexicon = Lexicon.getInstance();
private EngineParsers parsers;

@@ -70,7 +74,10 @@ public BiblioItem processing(String input, int consolidate) {
//input = input.replaceAll("\\p{Cntrl}", " ").trim();

List<LayoutToken> tokens = analyzer.tokenizeWithLayoutToken(input);
return processing(tokens, consolidate);
BiblioItem biblioItem = processing(tokens, consolidate);
// store original references to enable raw output
biblioItem.setReference(input);
return biblioItem;
}

public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {
@@ -146,6 +153,7 @@ public BiblioItem processing(List<LayoutToken> tokens, int consolidate) {

return resCitation;
} catch (Exception e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw new GrobidException(
"An exception occured while running Grobid.", e);
}
@@ -171,7 +179,7 @@ public List<BibDataSet> processingReferenceSection(String referenceTextBlock, Re
}

public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmenter referenceSegmenter, int consolidate) {
List<BibDataSet> results = new ArrayList<BibDataSet>();
List<BibDataSet> results = new ArrayList<>();

String referencesStr = doc.getDocumentPartText(SegmentationLabels.REFERENCES);

@@ -292,8 +300,10 @@ public List<BibDataSet> processingReferenceSection(DocumentSource documentSource
GrobidAnalysisConfig.builder().consolidateCitations(consolidate).build());
results = processingReferenceSection(doc, referenceSegmenter, consolidate);
} catch (GrobidException e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw e;
} catch (Exception e) {
LOGGER.error("An exception occured while running Grobid.", e);
throw new GrobidException("An exception occurred while running Grobid.", e);
}

@@ -448,9 +458,9 @@ else if (consolidate == 2)
BiblioItem.injectDOI(resCitation, bibo);
}
} catch (Exception e) {
// e.printStackTrace();
LOGGER.error("An exception occurred while running bibliographical data consolidation.", e);
throw new GrobidException(
"An exception occured while running bibliographical data consolidation.", e);
"An exception occurred while running bibliographical data consolidation.", e);
}
return resCitation;
}
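With CitationParser now storing the raw input via setReference and toBibTeX accepting a GrobidAnalysisConfig, the raw citation can be echoed into the BibTeX output. A hedged fragment, assuming item is a BiblioItem returned by CitationParser.processing(...) (imports and parser setup omitted):

// Sketch only, not part of this diff: render a parsed citation as BibTeX,
// including the raw input that CitationParser now stores via setReference(...).
GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder()
    .includeRawCitations(true) // emits the additional "raw = {...}" field
    .build();
String entry = item.toBibTeX("b0", config); // "b0" is an arbitrary BibTeX key
System.out.println(entry);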