correctly rename the training data when the original PDF contains the word `.pdf` #776
lfoppiano committed Nov 28, 2024
1 parent 37dcec1 commit 23d4f25
Showing 2 changed files with 27 additions and 26 deletions.
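For context, `String.replace` substitutes every literal occurrence of ".pdf" anywhere in the file name, while the new `replaceAll("(?i)\\.pdf$", ...)` only rewrites a final, case-insensitively matched ".pdf" extension. A minimal sketch of the difference (the class and file name below are hypothetical, chosen only to show the edge case this commit fixes):

public class RenameSketch {
    public static void main(String[] args) {
        // hypothetical input whose base name itself contains ".pdf"
        String pdfFileName = "report.pdf-v2.pdf";

        // old behavior: every occurrence of the literal ".pdf" is replaced
        String oldName = pdfFileName.replace(".pdf", ".training.segmentation");
        // -> "report.training.segmentation-v2.training.segmentation"

        // new behavior: only the trailing extension is replaced, case-insensitively
        String newName = pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation");
        // -> "report.pdf-v2.training.segmentation"

        System.out.println(oldName);
        System.out.println(newName);
    }
}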
@@ -1112,8 +1112,7 @@ public Document createTraining(File inputFile,
List<LayoutToken> tokenizations = doc.getTokenizations();

// we write first the full text untagged (but featurized with segmentation features)
- String outPathFulltext = pathFullText + File.separator +
- pdfFileName.replace(".pdf", ".training.segmentation");
+ String outPathFulltext = pathFullText + File.separator + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation");
Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), StandardCharsets.UTF_8);
writer.write(fulltext + "\n");
writer.close();
@@ -1124,7 +1123,7 @@ public Document createTraining(File inputFile,
rawtxt.append(txtline.getText());
}
String outPathRawtext = pathFullText + File.separator +
pdfFileName.replace(".pdf", ".training.segmentation.rawtxt");
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation.rawtxt");
FileUtils.writeStringToFile(new File(outPathRawtext), rawtxt.toString(), StandardCharsets.UTF_8);

if (isNotBlank(fulltext)) {
@@ -1134,7 +1133,7 @@ public Document createTraining(File inputFile,
// write the TEI file to reflect the exact layout of the text as extracted from the pdf
writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator +
pdfFileName.replace(".pdf", ".training.segmentation.tei.xml")), false), StandardCharsets.UTF_8);
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation.tei.xml")), false), StandardCharsets.UTF_8);
writer.write("<?xml version=\"1.0\" ?>\n<tei xml:space=\"preserve\">\n\t<teiHeader>\n\t\t<fileDesc xml:id=\"" + id +
"\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");

@@ -1156,13 +1155,13 @@ public Document createTraining(File inputFile,
String raw = result.getRight();
if (tei != null) {
String outPath = pathTEI + "/" +
pdfFileName.replace(".pdf", ".training.references.referenceSegmenter.tei.xml");
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.referenceSegmenter.tei.xml");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8);
writer.write(tei + "\n");
writer.close();

// generate also the raw vector file with the features
outPath = pathTEI + "/" + pdfFileName.replace(".pdf", ".training.references.referenceSegmenter");
outPath = pathTEI + "/" + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.referenceSegmenter");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8);
writer.write(raw + "\n");
writer.close();
@@ -1203,7 +1202,7 @@ public Document createTraining(File inputFile,

Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator +
pdfFileName.replace(".pdf", ".training.references.tei.xml")), false), StandardCharsets.UTF_8);
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.tei.xml")), false), StandardCharsets.UTF_8);

writerReference.write("<?xml version=\"1.0\" ?>\n<TEI xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\" " +
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" " +
@@ -1225,7 +1224,7 @@ public Document createTraining(File inputFile,
// BIBLIO REFERENCE AUTHOR NAMES
Writer writerName = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator +
pdfFileName.replace(".pdf", ".training.references.authors.tei.xml")), false), StandardCharsets.UTF_8);
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.authors.tei.xml")), false), StandardCharsets.UTF_8);

writerName.write("<?xml version=\"1.0\" ?>\n<TEI xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\" " +
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" " +
@@ -1272,7 +1271,7 @@ public Document createTraining(File inputFile,

// we write the full text untagged
outPathFulltext = pathFullText + File.separator
+ pdfFileName.replace(".pdf", ".training.fulltext");
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.fulltext");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), StandardCharsets.UTF_8);
writer.write(bodytext + "\n");
writer.close();
@@ -1285,7 +1284,7 @@ public Document createTraining(File inputFile,
// write the TEI file to reflect the exact layout of the text as extracted from the pdf
writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator +
pdfFileName.replace(".pdf", ".training.fulltext.tei.xml")), false), StandardCharsets.UTF_8);
pdfFileName.replaceAll("(?i)\\.pdf$", ".training.fulltext.tei.xml")), false), StandardCharsets.UTF_8);
if (id == -1) {
writer.write("<?xml version=\"1.0\" ?>\n<tei xml:space=\"preserve\">\n\t<teiHeader/>\n\t<text xml:lang=\"en\">\n");
}
@@ -1301,13 +1300,13 @@ public Document createTraining(File inputFile,
Pair<String,String> trainingFigure = processTrainingDataFigures(rese, tokenizationsBody, inputFile.getName());
if (trainingFigure.getLeft().trim().length() > 0) {
String outPathFigures = pathFullText + File.separator
+ pdfFileName.replace(".pdf", ".training.figure");
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.figure");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFigures), false), StandardCharsets.UTF_8);
writer.write(trainingFigure.getRight() + "\n\n");
writer.close();

String outPathFiguresTEI = pathTEI + File.separator
+ pdfFileName.replace(".pdf", ".training.figure.tei.xml");
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.figure.tei.xml");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFiguresTEI), false), StandardCharsets.UTF_8);
writer.write(trainingFigure.getLeft() + "\n");
writer.close();
@@ -1317,13 +1316,13 @@ public Document createTraining(File inputFile,
Pair<String,String> trainingTable = processTrainingDataTables(rese, tokenizationsBody, inputFile.getName());
if (trainingTable.getLeft().trim().length() > 0) {
String outPathTables = pathFullText + File.separator
+ pdfFileName.replace(".pdf", ".training.table");
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.table");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTables), false), StandardCharsets.UTF_8);
writer.write(trainingTable.getRight() + "\n\n");
writer.close();

String outPathTablesTEI = pathTEI + File.separator
+ pdfFileName.replace(".pdf", ".training.table.tei.xml");
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.table.tei.xml");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTablesTEI), false), StandardCharsets.UTF_8);
writer.write(trainingTable.getLeft() + "\n");
writer.close();
@@ -1352,7 +1351,7 @@ public Document createTraining(File inputFile,

if ((header != null) && (header.trim().length() > 0)) {
// we write the header untagged
- String outPathHeader = pathTEI + File.separator + pdfFileName.replace(".pdf", ".training.header");
+ String outPathHeader = pathTEI + File.separator + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header");
writer = new OutputStreamWriter(new FileOutputStream(new File(outPathHeader), false), StandardCharsets.UTF_8);
writer.write(header + "\n");
writer.close();
@@ -1467,9 +1466,9 @@ public Document createTraining(File inputFile,
// write the training TEI file for header which reflects the exact layout of the text as
// extracted from the pdf
writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
+ pdfFileName.replace(".pdf", ".training.header.tei.xml")), false), StandardCharsets.UTF_8);
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.tei.xml")), false), StandardCharsets.UTF_8);
writer.write("<?xml version=\"1.0\" ?>\n<tei xml:space=\"preserve\">\n\t<teiHeader>\n\t\t<fileDesc xml:id=\""
+ pdfFileName.replace(".pdf", "")
+ pdfFileName.replaceAll("(?i)\\.pdf$", "")
+ "\"/>\n\t</teiHeader>\n\t<text");

if (lang != null) {
@@ -1486,7 +1485,7 @@ public Document createTraining(File inputFile,
if (bufferAffiliation.length() > 0) {
Writer writerAffiliation = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator
+ pdfFileName.replace(".pdf", ".training.header.affiliation.tei.xml")), false), StandardCharsets.UTF_8);
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.affiliation.tei.xml")), false), StandardCharsets.UTF_8);
writerAffiliation.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
writerAffiliation.write("\n<tei xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\""
+ " xmlns:xlink=\"http://www.w3.org/1999/xlink\" " + "xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">");
@@ -1507,7 +1506,7 @@ public Document createTraining(File inputFile,
if (bufferDate.length() > 0) {
Writer writerDate = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator
+ pdfFileName.replace(".pdf", ".training.header.date.xml")), false), StandardCharsets.UTF_8);
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.date.xml")), false), StandardCharsets.UTF_8);
writerDate.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
writerDate.write("<dates>\n");

@@ -1523,7 +1522,7 @@ public Document createTraining(File inputFile,
if (bufferName.length() > 0) {
Writer writerName = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator
+ pdfFileName.replace(".pdf", ".training.header.authors.tei.xml")), false), StandardCharsets.UTF_8);
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.authors.tei.xml")), false), StandardCharsets.UTF_8);
writerName.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
writerName.write("\n<tei xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\"" + " xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
+ "xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">");
@@ -1546,7 +1545,7 @@ public Document createTraining(File inputFile,
if (bufferReference.length() > 0) {
Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator
+ pdfFileName.replace(".pdf", ".training.header.reference.xml")), false), StandardCharsets.UTF_8);
+ pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.reference.xml")), false), StandardCharsets.UTF_8);
writerReference.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
writerReference.write("<citations>\n");

@@ -1,6 +1,7 @@
package org.grobid.core.engines;

import eugfc.imageio.plugins.PNMRegistry;
+ import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.io.FileUtils;
import org.grobid.core.GrobidModels;
import org.grobid.core.document.BasicStructureBuilder;
@@ -230,7 +231,7 @@ private void dealWithImages(DocumentSource documentSource, Document doc, File as
* Addition of the features at line level for the complete document.
* <p/>
* This is an alternative to the token level, where the unit for labeling is the line - so allowing faster
- * processing and involving less features.
+ * processing and involving fewer features.
* Lexical features become line prefix and suffix, the feature text unit is the first 10 characters of the
* line without space.
* The dictionary flags are at line level (i.e. the line contains a name mention, a place mention, a year, etc.)
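As a rough illustration only (not GROBID's actual feature extraction; the variable names and the sample line below are made up), the line-level lexical features described in the comment above could be derived roughly like this:

String line = "Accepted for publication 12 June 1998";                  // hypothetical input line
String compact = line.replaceAll("\\s", "");                             // line with spaces removed
String textUnit = compact.substring(0, Math.min(10, compact.length()));  // first 10 characters as the text unit
String prefix = line.substring(0, Math.min(4, line.length()));           // lexical prefix feature
String suffix = line.substring(Math.max(0, line.length() - 4));          // lexical suffix feature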
@@ -319,8 +320,9 @@ private String getFeatureVectorsAsString(Document doc, Map<String, Integer> patt
mm = 0;
//endPage = true;

- if ((page.getBlocks() == null) || (page.getBlocks().size() == 0))
+ if (CollectionUtils.isEmpty(page.getBlocks())) {
continue;
+ }

for(int blockIndex=0; blockIndex < page.getBlocks().size(); blockIndex++) {
Block block = page.getBlocks().get(blockIndex);
@@ -444,7 +446,7 @@ private String getFeatureVectorsAsString(Document doc, Map<String, Integer> patt
if (text == null)
continue;

- // final sanitisation and filtering
+ // final sanitization and filtering
text = text.replaceAll("[ \n\r]", "");
text = text.trim();

@@ -754,7 +756,7 @@ public void createBlankTrainingData(File file,

// we write the full text untagged (but featurized)
String outPathFulltext = pathFullText + File.separator +
PDFFileName.replace(".pdf", ".training.blank");
PDFFileName.replaceAll("(?i)\\.pdf$", ".training.blank");
Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8");
writer.write(fulltext + "\n");
writer.close();
@@ -770,7 +772,7 @@ public void createBlankTrainingData(File file,
// write the TEI file to reflect the exact layout of the text as extracted from the pdf
writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
File.separator +
PDFFileName.replace(".pdf", ".training.blank.tei.xml")), false), "UTF-8");
PDFFileName.replaceAll("(?i)\\.pdf$", ".training.blank.tei.xml")), false), "UTF-8");
writer.write("<?xml version=\"1.0\" ?>\n<tei xml:space=\"preserve\">\n\t<teiHeader>\n\t\t<fileDesc xml:id=\"f" + id +
"\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");

