From 941d472c7ccba978d5e0cdca2480c1d2d6dc9de5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 19 Dec 2024 16:12:34 +0100 Subject: [PATCH] allow a loose approach to gather table/figure starting token when there is likely more figures/tables obtained by the specific models as in contrast with the initial fulltext sequence --- .../grobid/core/engines/FullTextParser.java | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 3316a9f769..3b4e273708 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -374,10 +374,15 @@ static String revertResultsForBadItems(List badFiguresOrTables .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); + long numberItems = labelledResultsAsList.stream() + .filter(r -> Iterables.getLast(r).startsWith("I-" + itemLabel)) + .count(); + for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List layoutTokenItem = badItem.getLayoutTokens(); - List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel); + List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, + itemLabel, !(badFiguresOrTables.size() > numberItems)); if (candidateIndexes.isEmpty()) { LOGGER.info("Cannot find the candidate index for fixing the tables."); continue; @@ -447,20 +452,37 @@ static int consolidateResultCandidateThroughSequence(List candidateInde return resultIndexCandidate; } + /** + * Find a set of candidates representing the indexes from the labelledResults which could correspond + * to the first token of the figure/table + * + * strict = True check the I- or I-
first and then the <figure>
or <table>
only if there are not candidates + * strict = False is usually necessary if there are more tables than I- token, this because a figure/table could be + * identified within the sequence initially provided by the fulltext model + * + */ @NotNull static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel) { + return findCandidateIndex(layoutTokenItem, labelledResultsAsList, itemLabel, true); + } + + @NotNull + static List findCandidateIndex(List layoutTokenItem, List> labelledResultsAsList, String itemLabel, boolean strict) { LayoutToken firstLayoutTokenItem = layoutTokenItem.get(0); List candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-"+ itemLabel)) + && Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) .boxed() .collect(Collectors.toList()); - if (candidateIndexes.isEmpty()) { + if (candidateIndexes.isEmpty() || !strict) { candidateIndexes = IntStream.range(0, labelledResultsAsList.size()) .filter(i -> labelledResultsAsList.get(i).get(0).equals(firstLayoutTokenItem.getText()) - && Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel)) + && ( + Iterables.getLast(labelledResultsAsList.get(i)).equals(itemLabel) + || Iterables.getLast(labelledResultsAsList.get(i)).equals("I-" + itemLabel)) + ) .boxed() .collect(Collectors.toList()); } @@ -2247,10 +2269,12 @@ protected Pair processTrainingDataFigures(String rese, // If there still an open figure if (openFigure) { - while((tokenizationsFigure.size() > 0) && + while(CollectionUtils.isNotEmpty(tokenizationsFigure) && (tokenizationsFigure.get(0).getText().equals("\n") || - tokenizationsFigure.get(0).getText().equals(" ")) ) + tokenizationsFigure.get(0).getText().equals(" ")) + ) { tokenizationsFigure.remove(0); + } // process the "accumulated" figure Pair trainingData = 
parsers.getFigureParser()