Skip to content

Commit

Permalink
collect discarded data from the header, tables and figures and place …
Browse files Browse the repository at this point in the history
…it somewhere
  • Loading branch information
lfoppiano committed Dec 22, 2024
1 parent 09b28cd commit 4fde054
Show file tree
Hide file tree
Showing 14 changed files with 270 additions and 109 deletions.
17 changes: 15 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import org.grobid.core.data.util.AuthorEmailAssigner;
import org.grobid.core.data.util.ClassicAuthorEmailAssigner;
import org.grobid.core.data.util.EmailSanitizer;
import org.grobid.core.data.CopyrightsLicense;
import org.grobid.core.document.*;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.exceptions.GrobidException;
Expand All @@ -22,7 +21,6 @@
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.GrobidModels;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;

import java.net.URLEncoder;
Expand Down Expand Up @@ -382,6 +380,9 @@ public String toString() {
// Copyrights/license information object
CopyrightsLicense copyrightsLicense = null;

// All the tokens that are considered noise will be collected here
private List<List<LayoutToken>> discardedPiecesTokens = new ArrayList<>();

public static final List<String> confPrefixes = Arrays.asList("Proceedings of", "proceedings of",
"In Proceedings of the", "In: Proceeding of", "In Proceedings, ", "In Proceedings of",
"In Proceeding of", "in Proceeding of", "in Proceeding", "In Proceeding", "Proceedings",
Expand Down Expand Up @@ -4522,4 +4523,16 @@ public void setCopyrightsLicense(CopyrightsLicense copyrightsLicense) {
/**
 * Returns the copyrights/license information object attached to this item.
 *
 * @return the stored {@code CopyrightsLicense}, or {@code null} when none has been set
 */
public CopyrightsLicense getCopyrightsLicense() {
    return copyrightsLicense;
}

/**
 * Returns the token sequences that were considered noise and set aside
 * for this bibliographic item.
 *
 * The returned list is the internal one, not a copy: changes made to it
 * are visible to this object.
 *
 * @return the list of discarded pieces, one inner list of tokens per piece
 */
public List<List<LayoutToken>> getDiscardedPiecesTokens() {
    return this.discardedPiecesTokens;
}

/**
 * Replaces the collection of discarded token sequences of this item.
 *
 * A {@code null} argument is normalised to a fresh empty list, so that
 * {@link #addDiscardedPieceTokens(List)} and code iterating over
 * {@link #getDiscardedPiecesTokens()} never dereference a null list.
 *
 * @param discardedPiecesTokens the new discarded pieces, one inner list of
 *                              tokens per piece; {@code null} is treated as empty
 */
public void setDiscardedPiecesTokens(List<List<LayoutToken>> discardedPiecesTokens) {
    if (discardedPiecesTokens == null) {
        // Keep the field non-null: the add method and the TEI serialisation
        // both use the list without a null check.
        this.discardedPiecesTokens = new ArrayList<>();
    } else {
        this.discardedPiecesTokens = discardedPiecesTokens;
    }
}

/**
 * Appends one discarded piece to the collection kept by this item.
 *
 * @param pieceToken the tokens forming the discarded piece
 */
public void addDiscardedPieceTokens(List<LayoutToken> pieceToken) {
    discardedPiecesTokens.add(pieceToken);
}
}
14 changes: 14 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ public boolean apply(GraphicObject graphicObject) {
private List<BoundingBox> textArea;
private List<LayoutToken> layoutTokens;

private List<List<LayoutToken>> discardedPiecesTokens = new ArrayList<>();

// coordinates
private int page = -1;
private double y = 0.0;
Expand Down Expand Up @@ -568,4 +570,16 @@ public void setLabel(StringBuilder label) {
/**
 * Sets the URI associated with this figure.
 *
 * @param uri the URI to store; kept as-is, no validation is performed here
 */
public void setUri(URI uri) {
this.uri = uri;
}

/**
 * Returns the token sequences that were discarded for this figure.
 *
 * The returned list is the internal one, not a copy: changes made to it
 * are visible to this object.
 *
 * @return the list of discarded pieces, one inner list of tokens per piece
 */
public List<List<LayoutToken>> getDiscardedPiecesTokens() {
    return this.discardedPiecesTokens;
}

/**
 * Replaces the collection of discarded token sequences of this figure.
 *
 * A {@code null} argument is normalised to a fresh empty list, so that
 * {@link #addDiscardedPieceTokens(List)} never dereferences a null list
 * and {@link #getDiscardedPiecesTokens()} never returns {@code null}.
 *
 * @param discardedPiecesTokens the new discarded pieces, one inner list of
 *                              tokens per piece; {@code null} is treated as empty
 */
public void setDiscardedPiecesTokens(List<List<LayoutToken>> discardedPiecesTokens) {
    if (discardedPiecesTokens == null) {
        // Keep the field non-null: the add method uses it without a null check.
        this.discardedPiecesTokens = new ArrayList<>();
    } else {
        this.discardedPiecesTokens = discardedPiecesTokens;
    }
}

/**
 * Appends one discarded piece to the collection kept by this figure.
 *
 * @param pieceToken the tokens forming the discarded piece
 */
public void addDiscardedPieceTokens(List<LayoutToken> pieceToken) {
    discardedPiecesTokens.add(pieceToken);
}
}
14 changes: 14 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ public class Table extends Figure {
private List<LayoutToken> noteLayoutTokens = null;
private String labeledNote = null;

private List<List<LayoutToken>> discardedPiecesTokens = new ArrayList<>();


public void setGoodTable(boolean goodTable) {
this.goodTable = goodTable;
Expand Down Expand Up @@ -423,4 +425,16 @@ public boolean isGoodTable() {
/**
 * Builds the identifier used for this table in the TEI output.
 *
 * @return the table id prefixed with {@code "tab_"}
 */
public String getTeiId() {
    return "tab_" + id;
}

/**
 * Returns the token sequences that were discarded for this table.
 *
 * The returned list is the internal one, not a copy: changes made to it
 * are visible to this object.
 *
 * NOTE(review): Table declares its own discardedPiecesTokens field and
 * accessors although Figure, its superclass, declares the same ones;
 * confirm whether the duplication is intended.
 *
 * @return the list of discarded pieces, one inner list of tokens per piece
 */
public List<List<LayoutToken>> getDiscardedPiecesTokens() {
    return this.discardedPiecesTokens;
}

/**
 * Replaces the collection of discarded token sequences of this table.
 *
 * A {@code null} argument is normalised to a fresh empty list, so that
 * {@link #addDiscardedPieceTokens(List)} never dereferences a null list
 * and {@link #getDiscardedPiecesTokens()} never returns {@code null}.
 *
 * @param discardedPiecesTokens the new discarded pieces, one inner list of
 *                              tokens per piece; {@code null} is treated as empty
 */
public void setDiscardedPiecesTokens(List<List<LayoutToken>> discardedPiecesTokens) {
    if (discardedPiecesTokens == null) {
        // Keep the field non-null: the add method uses it without a null check.
        this.discardedPiecesTokens = new ArrayList<>();
    } else {
        this.discardedPiecesTokens = discardedPiecesTokens;
    }
}

/**
 * Appends one discarded piece to the collection kept by this table.
 *
 * @param pieceToken the tokens forming the discarded piece
 */
public void addDiscardedPieceTokens(List<LayoutToken> pieceToken) {
    discardedPiecesTokens.add(pieceToken);
}
}
164 changes: 103 additions & 61 deletions grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,25 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
List<MarkerType> markerTypes,
List<Funding> fundings,
GrobidAnalysisConfig config) {
return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, fundings, config);
return toTEIHeader(
biblio,
SchemaDeclaration.XSD,
defaultPublicationStatement,
bds,
markerTypes,
fundings,
config
);
}

public StringBuilder toTEIHeader(BiblioItem biblio,
SchemaDeclaration schemaDeclaration,
String defaultPublicationStatement,
List<BibDataSet> bds,
List<MarkerType> markerTypes,
List<Funding> fundings,
GrobidAnalysisConfig config) {
public StringBuilder toTEIHeader(
BiblioItem biblio,
SchemaDeclaration schemaDeclaration,
String defaultPublicationStatement,
List<BibDataSet> bds,
List<MarkerType> markerTypes,
List<Funding> fundings,
GrobidAnalysisConfig config) {
StringBuilder tei = new StringBuilder();
tei.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
if (config.isWithXslStylesheet()) {
Expand Down Expand Up @@ -153,11 +162,9 @@ public StringBuilder toTEIHeader(BiblioItem biblio,

if (config.isGenerateTeiCoordinates("title")) {
List<LayoutToken> titleTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE);
if (titleTokens != null && titleTokens.size()>0) {
if (CollectionUtils.isNotEmpty(titleTokens)) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null) {
tei.append(" coords=\"" + coords + "\"");
}
tei.append(" coords=\"" + coords + "\"");
}
}

Expand All @@ -174,8 +181,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio,

tei.append("</title>\n");

if (fundings != null && fundings.size()>0) {

if (CollectionUtils.isNotEmpty(fundings)) {
Map<String,Funder> funderSignatures = new TreeMap<>();
for(Funding funding : fundings) {
if (funding.getFunder() != null && funding.getFunder().getFullName() != null) {
Expand Down Expand Up @@ -218,28 +224,30 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
for (Map.Entry<Funder, List<Funding>> entry : fundingRelation.entrySet()) {
String funderPiece = null;
Funder consolidatedFunder = null;
if (consolidatedFunders != null)
if (consolidatedFunders != null) {
consolidatedFunder = consolidatedFunders.get(n);
}

if (consolidatedFunder != null && config.getConsolidateFunders() == 1) {
funderPiece = consolidatedFunder.toTEI(4);
} else if (consolidatedFunder != null && config.getConsolidateFunders() == 2) {
Funder localFunder = entry.getKey();
localFunder.setDoi(consolidatedFunder.getDoi());
funderPiece = localFunder.toTEI(4);
} else
} else {
funderPiece = entry.getKey().toTEI(4);
}

// inject funding ref in the funder entries
String referenceString = "";
StringBuilder referenceString = new StringBuilder();
for(Funding funderFunding : entry.getValue()) {
if (funderFunding.isNonEmptyFunding())
referenceString += " #" + funderFunding.getIdentifier();
referenceString.append(" #").append(funderFunding.getIdentifier());
}

if (funderPiece != null) {
if (referenceString.length()>0)
funderPiece = funderPiece.replace("<funder>", "<funder ref=\"" + referenceString.trim() + "\">");
funderPiece = funderPiece.replace("<funder>", "<funder ref=\"" + referenceString.toString().trim() + "\">");
tei.append(funderPiece);
}
n++;
Expand Down Expand Up @@ -440,11 +448,9 @@ else if (biblio.getE_Year().length() == 4)

if (config.isGenerateTeiCoordinates("title")) {
List<LayoutToken> titleTokens = biblio.getLayoutTokens(TaggingLabels.HEADER_TITLE);
if (titleTokens != null && titleTokens.size()>0) {
if (CollectionUtils.isNotEmpty(titleTokens)) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null) {
tei.append(" coords=\"" + coords + "\"");
}
tei.append(" coords=\"" + coords + "\"");
}
}

Expand Down Expand Up @@ -537,7 +543,7 @@ else if (biblio.getE_Year().length() == 4)
// // TODO:
// }

// in case the booktitle corresponds to a proceedings, we can try to indicate the meeting title
// in case the book title corresponds to a proceedings, we can try to indicate the meeting title
String meeting = biblio.getBookTitle();
boolean meetLoc = false;
if (biblio.getEvent() != null)
Expand Down Expand Up @@ -592,11 +598,14 @@ else if (meeting != null) {

String pageRange = biblio.getPageRange();

if ((biblio.getVolumeBlock() != null) | (biblio.getPublicationDate() != null) |
(biblio.getNormalizedPublicationDate() != null) |
(pageRange != null) | (biblio.getIssue() != null) |
(biblio.getBeginPage() != -1) |
(biblio.getPublisher() != null)) {
if (biblio.getVolumeBlock() != null
|| biblio.getPublicationDate() != null
|| biblio.getNormalizedPublicationDate() != null
|| pageRange != null
|| biblio.getIssue() != null
|| biblio.getBeginPage() != -1
|| biblio.getPublisher() != null) {

tei.append("\t\t\t\t\t\t<imprint>\n");

if (biblio.getPublisher() != null) {
Expand Down Expand Up @@ -814,6 +823,24 @@ else if (biblio.getE_Year().length() == 4)
}

tei.append("\t\t\t</sourceDesc>\n");

// We collect the discarded text from the header and add it as <note> elements inside a <notesStmt>
if(config.isIncludeDiscardedText()) {
tei.append("\t\t\t<notesStmt>\n");
for (List<LayoutToken> discardedPieceTokens : biblio.getDiscardedPiecesTokens()) {
LayoutToken first = Iterables.getFirst(discardedPieceTokens, null);
String place = first == null ? "unknown" : first.getLabels().get(0).getGrobidModel().getModelName();

tei.append("\t\t\t\t<note type=\"other\" place=\"" + place + "\"");
if (generateIDs) {
String divID = KeyGen.getKey().substring(0, 7);
tei.append(" xml:id=\"_" + divID + "\"");
}
tei.append(">" + TextUtilities.HTMLEncode(normalizeText(LayoutTokensUtil.toText(discardedPieceTokens))) + "</note>\n");
}
tei.append("\t\t\t</notesStmt>\n");
}

tei.append("\t\t</fileDesc>\n");

// encodingDesc gives info about the producer of the file
Expand Down Expand Up @@ -923,24 +950,25 @@ else if (biblio.getE_Year().length() == 4)
tei.append("\t\t\t<abstract>\n");
}

if ((abstractText != null) && (abstractText.length() != 0)) {
if ( (biblio.getLabeledAbstract() != null) && (biblio.getLabeledAbstract().length() > 0) ) {
if (StringUtils.isNotBlank(abstractText)) {
if (StringUtils.isNotBlank (biblio.getLabeledAbstract()) ) {
// we have available structured abstract, which can be serialized as a full text "piece"
StringBuilder buffer = new StringBuilder();
try {
buffer = toTEITextPiece(buffer,
biblio.getLabeledAbstract(),
biblio,
bds,
false,
new LayoutTokenization(biblio.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT)),
null,
null,
null,
null,
markerTypes,
doc,
config); // no figure, no table, no equation
buffer = toTEITextPiece(
buffer,
biblio.getLabeledAbstract(),
biblio,
bds,
false,
new LayoutTokenization(biblio.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT)),
null,
null,
null,
null,
markerTypes,
doc,
config); // no figure, no table, no equation
} catch(Exception e) {
throw new GrobidException("An exception occurred while serializing TEI.", e);
}
Expand Down Expand Up @@ -1065,8 +1093,21 @@ public StringBuilder toTEIBody(StringBuilder buffer,

List<Note> notes = getTeiNotes(doc);

buffer = toTEITextPiece(buffer, result, biblio, bds, true,
layoutTokenization, figures, tables, equations, notes, markerTypes, doc, config);
buffer = toTEITextPiece(
buffer,
result,
biblio,
bds,
true,
layoutTokenization,
figures,
tables,
equations,
notes,
markerTypes,
doc,
config
);

// notes are still in the body
buffer = toTEINote(buffer, notes, doc, markerTypes, config);
Expand Down Expand Up @@ -1396,19 +1437,20 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
return buffer;
}

public StringBuilder toTEITextPiece(StringBuilder buffer,
String result,
BiblioItem biblio,
List<BibDataSet> bds,
boolean keepUnsolvedCallout,
LayoutTokenization layoutTokenization,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
List<Note> notes,
List<MarkerType> markerTypes,
Document doc,
GrobidAnalysisConfig config) throws Exception {
public StringBuilder toTEITextPiece(
StringBuilder buffer,
String result,
BiblioItem biblio,
List<BibDataSet> bds,
boolean keepUnsolvedCallout,
LayoutTokenization layoutTokenization,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
List<Note> notes,
List<MarkerType> markerTypes,
Document doc,
GrobidAnalysisConfig config) throws Exception {
TaggingLabel lastClusterLabel = null;
int startPosition = buffer.length();

Expand Down Expand Up @@ -1472,7 +1514,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
clusterLabel.equals(TaggingLabels.EQUATION_LABEL)) {
// get starting position of the cluster
int start = -1;
if ( (cluster.concatTokens() != null) && (cluster.concatTokens().size() > 0) ) {
if (CollectionUtils.isNotEmpty (cluster.concatTokens()) ) {
start = cluster.concatTokens().get(0).getOffset();
}
// get the corresponding equation
Expand Down Expand Up @@ -2505,7 +2547,7 @@ public List<Node> markReferencesEquationTEI(String text,
String bestFormula = null;
if (equations != null) {
for (Equation equation : equations) {
if ((equation.getLabel() != null) && (equation.getLabel().length() > 0)) {
if (StringUtils.isNotBlank(equation.getLabel())) {
String label = TextUtilities.cleanField(equation.getLabel(), false);
Matcher m2 = patternNumber.matcher(label);
String labelNumber = null;
Expand Down
Loading

0 comments on commit 4fde054

Please sign in to comment.