Skip to content

Commit

Permalink
added annex figures, tables, equations
Browse files Browse the repository at this point in the history
  • Loading branch information
de-code committed Apr 14, 2021
1 parent 17f2d46 commit 3605866
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1131,15 +1131,18 @@ public StringBuilder toTEIAnnex(StringBuilder buffer,
BiblioItem biblio,
List<BibDataSet> bds,
List<LayoutToken> tokenizations,
Document doc,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
Document doc,
GrobidAnalysisConfig config) throws Exception {
if ((result == null) || (tokenizations == null)) {
return buffer;
}

buffer.append("\t\t\t<div type=\"annex\">\n");
buffer = toTEITextPiece(buffer, result, biblio, bds, true,
new LayoutTokenization(tokenizations), null, null, null, doc, config);
new LayoutTokenization(tokenizations), figures, tables, equations, doc, config);
buffer.append("\t\t\t</div>\n");

return buffer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,29 +250,10 @@ else if (config.getConsolidateCitations() == 2)

// we apply now the figure and table models based on the fulltext labeled output
figures = processFigures(resultBody, layoutTokenization.getTokenization(), doc);
// further parse the caption
for(Figure figure : figures) {
if (CollectionUtils.isNotEmpty(figure.getCaptionLayoutTokens()) ) {
Pair<String, List<LayoutToken>> captionProcess = processShort(figure.getCaptionLayoutTokens(), doc);
figure.setLabeledCaption(captionProcess.getLeft());
figure.setCaptionLayoutTokens(captionProcess.getRight());
}
}
postProcessFigureCaptions(figures, doc);

tables = processTables(resultBody, layoutTokenization.getTokenization(), doc);
// further parse the caption
for(Table table : tables) {
if ( CollectionUtils.isNotEmpty(table.getCaptionLayoutTokens()) ) {
Pair<String, List<LayoutToken>> captionProcess = processShort(table.getCaptionLayoutTokens(), doc);
table.setLabeledCaption(captionProcess.getLeft());
table.setCaptionLayoutTokens(captionProcess.getRight());
}
if ( CollectionUtils.isNotEmpty(table.getNoteLayoutTokens())) {
Pair<String, List<LayoutToken>> noteProcess = processShort(table.getNoteLayoutTokens(), doc);
table.setLabeledNote(noteProcess.getLeft());
table.setNoteLayoutTokens(noteProcess.getRight());
}
}
postProcessTableCaptions(tables, doc);

equations = processEquations(resultBody, layoutTokenization.getTokenization(), doc);
} else {
Expand All @@ -283,6 +264,9 @@ else if (config.getConsolidateCitations() == 2)
documentBodyParts = doc.getDocumentPart(SegmentationLabels.ANNEX);
featSeg = getBodyTextFeatured(doc, documentBodyParts);
String resultAnnex = null;
List<Figure> annexFigures = null;
List<Table> annexTables = null;
List<Equation> annexEquations = null;
List<LayoutToken> tokenizationsBody2 = null;
if (featSeg != null && isNotEmpty(trim(featSeg.getLeft()))) {
// if featSeg is null, it usually means that no body segment is found in the
Expand All @@ -291,6 +275,14 @@ else if (config.getConsolidateCitations() == 2)
tokenizationsBody2 = featSeg.getRight().getTokenization();
resultAnnex = label(bodytext);
//System.out.println(rese);

annexFigures = processFigures(resultAnnex, tokenizationsBody2, doc);
postProcessFigureCaptions(annexFigures, doc);

annexTables = processTables(resultAnnex, tokenizationsBody2, doc);
postProcessTableCaptions(annexTables, doc);

annexEquations = processEquations(resultAnnex, tokenizationsBody2, doc);
}

// final combination
Expand All @@ -299,6 +291,7 @@ else if (config.getConsolidateCitations() == 2)
layoutTokenization, tokenizationsBody2, // tokenization for body and annex
resHeader, // header
figures, tables, equations,
annexFigures, annexTables, annexEquations,
config);
return doc;
} catch (GrobidException e) {
Expand Down Expand Up @@ -1930,6 +1923,19 @@ protected List<Figure> processFigures(String rese, List<LayoutToken> layoutToken
return results;
}

/**
 * Further parses the caption of each given figure, updating the figures in place.
 *
 * <p>For every figure whose caption layout tokens are non-empty, the tokens are passed to
 * {@code processShort}. The left element of the returned pair is stored as the figure's
 * labeled caption and the right element replaces the figure's caption layout tokens.
 * Figures without caption layout tokens are left untouched.
 *
 * @param figures figures to post-process; a {@code null} list is treated as empty
 * @param doc     document forwarded unchanged to {@code processShort}
 */
protected void postProcessFigureCaptions(
        List<Figure> figures,
        Document doc
) {
    if (figures == null) {
        // Nothing to do; avoids a NullPointerException when iterating below.
        return;
    }

    for (Figure figure : figures) {
        List<LayoutToken> captionTokens = figure.getCaptionLayoutTokens();
        if (!CollectionUtils.isNotEmpty(captionTokens)) {
            // No caption tokens on this figure: keep it as it is.
            continue;
        }

        Pair<String, List<LayoutToken>> captionProcess = processShort(captionTokens, doc);
        figure.setLabeledCaption(captionProcess.getLeft());
        figure.setCaptionLayoutTokens(captionProcess.getRight());
    }
}

/**
* Create training data for the figures as identified by the full text model.
Expand Down Expand Up @@ -2103,6 +2109,24 @@ protected List<Table> processTables(String rese,
return results;
}

/**
 * Further parses the caption and the note of each given table, updating the tables in place.
 *
 * <p>For every table whose caption layout tokens are non-empty, the tokens are passed to
 * {@code processShort}; the left element of the returned pair is stored as the labeled
 * caption and the right element replaces the caption layout tokens. The same treatment is
 * applied independently to the table's note layout tokens (labeled note / note layout
 * tokens). A table with neither caption nor note tokens is left untouched.
 *
 * @param tables tables to post-process; a {@code null} list is treated as empty
 * @param doc    document forwarded unchanged to {@code processShort}
 */
protected void postProcessTableCaptions(
        List<Table> tables,
        Document doc
) {
    if (tables == null) {
        // Nothing to do; avoids a NullPointerException when iterating below.
        return;
    }

    for (Table table : tables) {
        // Caption first, then note, in the same order as before.
        List<LayoutToken> captionTokens = table.getCaptionLayoutTokens();
        if (CollectionUtils.isNotEmpty(captionTokens)) {
            Pair<String, List<LayoutToken>> captionProcess = processShort(captionTokens, doc);
            table.setLabeledCaption(captionProcess.getLeft());
            table.setCaptionLayoutTokens(captionProcess.getRight());
        }

        List<LayoutToken> noteTokens = table.getNoteLayoutTokens();
        if (CollectionUtils.isNotEmpty(noteTokens)) {
            Pair<String, List<LayoutToken>> noteProcess = processShort(noteTokens, doc);
            table.setLabeledNote(noteProcess.getLeft());
            table.setNoteLayoutTokens(noteProcess.getRight());
        }
    }
}

/**
* Create training data for the table as identified by the full text model.
Expand Down Expand Up @@ -2312,6 +2336,9 @@ private void toTEI(Document doc,
List<Figure> figures,
List<Table> tables,
List<Equation> equations,
List<Figure> annexFigures,
List<Table> annexTables,
List<Equation> annexEquations,
GrobidAnalysisConfig config) {
if (doc.getBlocks() == null) {
return;
Expand Down Expand Up @@ -2348,7 +2375,10 @@ private void toTEI(Document doc,
}

tei = teiFormatter.toTEIAnnex(tei, reseAnnex, resHeader, resCitations,
tokenizationsAnnex, doc, config);
tokenizationsAnnex,
annexFigures, annexTables, annexEquations,
doc, config
);

tei = teiFormatter.toTEIReferences(tei, resCitations, config);
doc.calculateTeiIdToBibDataSets();
Expand Down

0 comments on commit 3605866

Please sign in to comment.