Commit
Merge pull request #1203 from kermitt2/elifesciencesfix-fulltext-block-start

Fix fulltext block start
lfoppiano authored Dec 26, 2024
2 parents 4c85ab0 + 671feb0 commit 24faed3
Showing 2 changed files with 73 additions and 15 deletions.
@@ -852,6 +852,7 @@ else if (nbAuthorType > (bibDataSets.size() / 2))
}
}

boolean isFirstBlockToken = true;
while (n < lastPos) {
if (blockIndex == dp2.getBlockPtr()) {
//if (n > block.getEndToken()) {
@@ -969,7 +970,7 @@ else if (lineStartX - previousLineStartX > characterWidth)
features.alignmentStatus = "ALIGNEDLEFT";
}

if (n == 0) {
if (isFirstBlockToken) {
features.lineStatus = "LINESTART";
// be sure that previous token is closing a line, except if it's a starting line
if (previousFeatures != null) {
Expand Down Expand Up @@ -1146,6 +1147,7 @@ else if (features.blockStatus == null) {
mm += text.length();
nn += text.length();
previousFeatures = features;
isFirstBlockToken = false;
}
// lowest position of the block
lowestPos = block.getY() + block.getHeight();
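
Taken together, the three hunks above replace the absolute-index test "if (n == 0)" with an isFirstBlockToken flag that is cleared only after a token's features have been emitted. Below is a minimal, self-contained Java sketch of that pattern; the identifiers, the loop structure and the line-feed handling are simplified stand-ins, not the actual getBodyTextFeatured implementation.

import java.util.Arrays;
import java.util.List;

// Minimal sketch of the pattern introduced by this commit; not the real
// FullTextParser.getBodyTextFeatured code.
public class BlockStartFlagSketch {

    public static void main(String[] args) {
        // A block whose first layout token is a line feed: no feature line is
        // produced for it, so the first printable token sits at n == 1 and the
        // old "n == 0" test would never mark it as LINESTART.
        List<String> tokens = Arrays.asList("\n", "This", "is", "a", "block");

        boolean isFirstBlockToken = true;   // stays true until features are emitted
        for (int n = 0; n < tokens.size(); n++) {
            String token = tokens.get(n);
            if ("\n".equals(token)) {
                continue;                   // line feeds yield no feature line here
            }
            String lineStatus = isFirstBlockToken ? "LINESTART" : "LINEIN";
            System.out.printf("%d\t%s\t%s%n", n, token, lineStatus);
            isFirstBlockToken = false;      // cleared once a feature line was produced
        }
    }
}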
@@ -1,9 +1,11 @@
package org.grobid.core.engines

import jnr.posix.BaseIovec.Layout
import org.apache.commons.lang3.StringUtils
import org.apache.commons.lang3.tuple.Triple
import org.grobid.core.analyzers.GrobidAnalyzer
import org.grobid.core.document.Document
import org.grobid.core.document.DocumentPiece
import org.grobid.core.document.DocumentPointer
import org.grobid.core.engines.label.TaggingLabels.TABLE_LABEL
import org.grobid.core.factory.GrobidFactory
import org.grobid.core.layout.LayoutToken
@@ -13,8 +15,8 @@ import org.grobid.core.utilities.GrobidProperties
import org.grobid.core.utilities.GrobidTestUtils
import org.hamcrest.CoreMatchers
import org.hamcrest.CoreMatchers.`is`
import org.hamcrest.MatcherAssert
import org.hamcrest.MatcherAssert.assertThat
import org.hamcrest.Matchers
import org.hamcrest.Matchers.hasSize
import org.hamcrest.collection.IsCollectionWithSize
import org.junit.AfterClass
@@ -88,8 +90,8 @@ class FullTextParserTest {
.map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] }
.collect(Collectors.joining(" "))

MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO"))
MatcherAssert.assertThat(
assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO"))
assertThat(
tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size,
CoreMatchers.`is`(13)
)
@@ -134,10 +136,10 @@ class FullTextParserTest {
}
}

MatcherAssert.assertThat<List<String>>(output, IsCollectionWithSize.hasSize(2))
MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )"))
MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO"))
MatcherAssert.assertThat(
assertThat<List<String>>(output, IsCollectionWithSize.hasSize(2))
assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )"))
assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO"))
assertThat(
tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size,
CoreMatchers.`is`(15)
)
@@ -177,8 +179,8 @@ class FullTextParserTest {
.map { l: String -> l.split("\t".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()[0] }
.collect(Collectors.joining(" "))

MatcherAssert.assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO"))
MatcherAssert.assertThat(
assertThat(reconstructedText, CoreMatchers.`is`("FIG . 1 . λ ( T ) vs . T for YBCO"))
assertThat(
tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size,
CoreMatchers.`is`(13)
)
@@ -223,10 +225,10 @@ class FullTextParserTest {
}
}

MatcherAssert.assertThat<List<String>>(output, IsCollectionWithSize.hasSize(2))
MatcherAssert.assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )"))
MatcherAssert.assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO"))
MatcherAssert.assertThat(
assertThat<List<String>>(output, IsCollectionWithSize.hasSize(2))
assertThat(output[0], CoreMatchers.`is`("FIG . 1 . λ ( T )"))
assertThat(output[1], CoreMatchers.`is`("vs . T for YBCO"))
assertThat(
tokenisation.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray().size,
CoreMatchers.`is`(15)
)
@@ -382,4 +384,58 @@ class FullTextParserTest {

assertThat(consolidatedTable3ResultCandidateThroughSequence, `is`(67))
}

@Test
@Throws(Exception::class)
fun testShouldOutputBlockStartForRegularBlock() {
val blockText = "This is a block"
val doc = Document.createFromText(blockText)
val documentParts = getWholeDocumentParts(doc)
val dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts)
// LOGGER.debug("data debug: {}", dataAndTokens.getLeft());
val lines = dataAndTokens.left.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()
assertThat(
"lines[0] fields",
Arrays.asList(
*lines[0].split("\\s".toRegex())
.dropLastWhile { it.isEmpty() }
.toTypedArray()), `is`(Matchers.hasItem("BLOCKSTART"))
)
}

@Test
@Throws(Exception::class)
fun testShouldOutputBlockStartForBlockStartingWithLineFeed() {
val blockText = "\nThis is a block"
val doc = Document.createFromText(blockText)
assertThat(
"doc.block[0].tokens[0].text",
doc.blocks[0].getTokens()[0].text,
CoreMatchers.`is`("\n")
)
val documentParts = getWholeDocumentParts(doc)
val dataAndTokens = FullTextParser.getBodyTextFeatured(doc, documentParts)
// LOGGER.debug("data debug: {}", dataAndTokens.getLeft());
val lines = dataAndTokens.left.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()
assertThat(
"lines[0] fields",
Arrays.asList(*lines[0].split("\\s".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()),
`is`(Matchers.hasItem("BLOCKSTART"))
)
}

private fun getWholeDocumentPiece(doc: Document): DocumentPiece {
return DocumentPiece(
DocumentPointer(0, 0, 0),
DocumentPointer(0, doc.tokenizations.size - 1, doc.tokenizations.size - 1)
)
}

private fun getWholeDocumentParts(doc: Document): SortedSet<DocumentPiece> {
return TreeSet(
setOf(
getWholeDocumentPiece(doc)
)
)
}
}
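
For readers following the surrounding Java codebase, the snippet below is a rough Java rendering of the two Kotlin helpers above (getWholeDocumentPiece and getWholeDocumentParts). It assumes the same GROBID types used in the test (Document, DocumentPiece, DocumentPointer); the class and method names are invented for illustration.

import java.util.SortedSet;
import java.util.TreeSet;

import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;

// Rough Java equivalent of the Kotlin test helpers: a single DocumentPiece
// spanning the whole document, wrapped in a sorted set.
class WholeDocumentPartsSketch {

    static DocumentPiece wholeDocumentPiece(Document doc) {
        int last = doc.getTokenizations().size() - 1;
        return new DocumentPiece(
                new DocumentPointer(0, 0, 0),
                new DocumentPointer(0, last, last));
    }

    static SortedSet<DocumentPiece> wholeDocumentParts(Document doc) {
        SortedSet<DocumentPiece> parts = new TreeSet<>();
        parts.add(wholeDocumentPiece(doc));
        return parts;
    }
}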
