Skip to content

Commit

Permalink
Write table content to TEI
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitaliy-1 committed Feb 25, 2020
1 parent d51a057 commit ca98798
Showing 1 changed file with 42 additions and 1 deletion.
43 changes: 42 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
import org.grobid.core.data.table.Line;
import org.grobid.core.data.table.LinePart;
import org.grobid.core.data.table.Row;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.document.Document;
import org.grobid.core.document.TEIFormatter;
Expand Down Expand Up @@ -131,7 +135,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form


Element contentEl = XmlBuilderUtils.teiElement("table");
contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens()));
processTableContent(contentEl, this.getContentTokens());
if ((config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("figure"))) {
XmlBuilderUtils.addCoords(contentEl, LayoutTokensUtil.getCoordsStringForOneBox(getContentTokens()));
}
Expand Down Expand Up @@ -179,6 +183,43 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// return theTable.toString();
}

/**
 * Builds the TEI body of the table: derives rows and cells from the given layout
 * tokens and appends them, as "row" and "cell" elements, to the supplied table element.
 * <p>
 * Per the original author's notes this is a line-based algorithm that relies on the
 * tokens' coordinates to identify lines; the detection itself is delegated to the
 * {@code Line} and {@code Row} helpers.
 *
 * @param contentEl table element to which the parsed rows and cells are appended
 * @param contentTokens layout tokens from which the cells are built
 */
void processTableContent(Element contentEl, List<LayoutToken> contentTokens) {
    // Pipeline: tokens -> line parts (original author: the cell lines originally
    // created by PDFAlto) -> lines (built by comparing borders) -> rows and cells.
    List<Row> tableRows = Row.extractRows(Line.extractLines(Line.extractLineParts(contentTokens)));

    // Order matters here: empty cells are inserted against the overall column count
    // first, and only afterwards are multi-column cells merged.
    Row.insertEmptyCells(tableRows, Row.columnCount(tableRows));
    Row.mergeMulticolumnCells(tableRows);

    // Serialize: one "row" element per row, one "cell" element per cell.
    for (Row tableRow : tableRows) {
        Element rowEl = XmlBuilderUtils.teiElement("row");
        contentEl.appendChild(rowEl);

        for (Cell tableCell : tableRow.getContent()) {
            Element cellEl = XmlBuilderUtils.teiElement("cell");
            rowEl.appendChild(cellEl);

            // The "cols" attribute is only emitted for cells spanning several columns.
            int span = tableCell.getColspan();
            if (span > 1) {
                cellEl.addAttribute(new Attribute("cols", Integer.toString(span)));
            }

            cellEl.appendChild(tableCell.getText().trim());
        }
    }
}

/**
 * Normalizes a string for single-line output: every line feed becomes a space,
 * each run of two or more spaces is collapsed to a single space, and leading and
 * trailing whitespace is trimmed.
 *
 * @param input the text to clean; must not be null
 * @return the cleaned text
 */
private String cleanString(String input) {
    // Replacing " " with " " (as the previous code did) is a no-op, so double spaces,
    // including those produced by the line-feed replacement, were never collapsed.
    return input.replace("\n", " ").replaceAll(" {2,}", " ").trim();
}
Expand Down

0 comments on commit ca98798

Please sign in to comment.