Skip to content

Commit

Permalink
fix LINEBLOCKSTARTS for new generated files #712
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Nov 21, 2024
1 parent f4d6245 commit 6c1e1e6
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,7 @@ else if (nbAuthorType > (bibDataSets.size() / 2))
}
}

boolean isFirstBlockToken = true;
while (n < lastPos) {
if (blockIndex == dp2.getBlockPtr()) {
//if (n > block.getEndToken()) {
Expand Down Expand Up @@ -842,7 +843,7 @@ else if (lineStartX - previousLineStartX > characterWidth)
features.alignmentStatus = "ALIGNEDLEFT";
}

if (n == 0) {
if (isFirstBlockToken) {
features.lineStatus = "LINESTART";
// be sure that previous token is closing a line, except if it's a starting line
if (previousFeatures != null) {
Expand Down Expand Up @@ -1019,6 +1020,7 @@ else if (features.blockStatus == null) {
mm += text.length();
nn += text.length();
previousFeatures = features;
isFirstBlockToken = false;
}
// lowest position of the block
lowestPos = block.getY() + block.getHeight();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,25 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.main.LibraryLoader;
import org.grobid.core.utilities.GrobidProperties;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;

public class FullTextParserTest {
Expand All @@ -41,6 +44,46 @@ public static void tearDown() {
GrobidFactory.reset();
}

/**
 * Builds a single {@link DocumentPiece} running from pointer (0, 0, 0) to a pointer
 * whose last two coordinates are the index of the last entry of
 * {@code doc.getTokenizations()}.
 *
 * @param doc the document whose tokenizations determine the end pointer
 * @return a piece delimited by the two pointers described above
 */
public DocumentPiece getWholeDocumentPiece(Document doc) {
    DocumentPointer start = new DocumentPointer(0, 0, 0);
    // NOTE(review): presumably the pointer arguments are (block, token-in-document,
    // token-in-block); the same index is deliberately passed for the last two.
    int lastTokenIndex = doc.getTokenizations().size() - 1;
    DocumentPointer end = new DocumentPointer(0, lastTokenIndex, lastTokenIndex);
    return new DocumentPiece(start, end);
}

/**
 * Wraps the piece returned by {@link #getWholeDocumentPiece(Document)} in a sorted set
 * containing only that piece.
 *
 * @param doc the document to build the piece from
 * @return a mutable {@link TreeSet} holding exactly one piece
 */
public SortedSet<DocumentPiece> getWholeDocumentParts(Document doc) {
    SortedSet<DocumentPiece> parts = new TreeSet<>();
    parts.add(getWholeDocumentPiece(doc));
    return parts;
}

/**
 * Checks that the first feature row produced for a plain one-line text
 * contains the BLOCKSTART field.
 */
@Test
public void testShouldOutputBlockStartForRegularBlock() throws Exception {
    Document document = Document.createFromText("This is a block");
    SortedSet<DocumentPiece> wholeDocument = getWholeDocumentParts(document);

    Pair<String, LayoutTokenization> featured =
        FullTextParser.getBodyTextFeatured(document, wholeDocument);

    // Only the first row matters here: split it into its whitespace-separated fields.
    String firstRow = featured.getLeft().split("\n")[0];
    String[] firstRowFields = firstRow.split("\\s");
    assertThat("lines[0] fields", Arrays.asList(firstRowFields), is(hasItem("BLOCKSTART")));
}

/**
 * Checks that the first feature row still contains the BLOCKSTART field when the
 * first token of the first block is a line feed.
 */
@Test
public void testShouldOutputBlockStartForBlockStartingWithLineFeed() throws Exception {
    Document document = Document.createFromText("\nThis is a block");

    // Precondition: the leading line feed must survive as the very first token of block 0,
    // otherwise this test would not exercise the intended case.
    String firstTokenText = document.getBlocks().get(0).getTokens().get(0).getText();
    assertThat("doc.block[0].tokens[0].text", firstTokenText, is("\n"));

    SortedSet<DocumentPiece> wholeDocument = getWholeDocumentParts(document);
    Pair<String, LayoutTokenization> featured =
        FullTextParser.getBodyTextFeatured(document, wholeDocument);

    // Only the first row matters here: split it into its whitespace-separated fields.
    String firstRow = featured.getLeft().split("\n")[0];
    String[] firstRowFields = firstRow.split("\\s");
    assertThat("lines[0] fields", Arrays.asList(firstRowFields), is(hasItem("BLOCKSTART")));
}

@Test
public void testProcessTrainingDataFigures_single_figure() throws Exception {
String text = "The mechanism for superconductivity FIG. 1. λ(T) vs . T for YBCO";
Expand Down

0 comments on commit 6c1e1e6

Please sign in to comment.