Skip to content

Commit

Permalink
fix header sequence with delft to revert any labels that do not have …
Browse files Browse the repository at this point in the history
…a valid I- starting label
  • Loading branch information
lfoppiano committed Dec 18, 2024
1 parent 21f85c9 commit 1fa792c
Show file tree
Hide file tree
Showing 3 changed files with 223 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,15 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
String res = null;
if (StringUtils.isNotBlank(header)) {
res = label(header);

if (GrobidProperties.getGrobidEngineName("header").equals("delft")) {
res = LabelUtils.postProcessFulltextCorrectSequencesWithoutInitialToken(res);
}
resHeader = resultExtraction(res, headerTokenization, resHeader);
}



// language identification
StringBuilder contentSample = new StringBuilder();
if (resHeader.getTitle() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,49 @@ object LabelUtils {
return result.toString()
}

/**
 * Post-processes a labelled sequence by reverting label changes that do not begin with an
 * initial ("I-") token.
 *
 * Each non-blank line of the input is treated as tab-separated columns whose last non-empty
 * column is the label. When a label differs from the one carried over from the previous line,
 * is not [TaggingLabels.OTHER_LABEL] and does not start with "I-", it is replaced by the
 * carried-over label. For example, in a sequence `I-<abstract>`, `<abstract>` followed by
 * `<availability>` (without `I-<availability>`), the `<availability>` lines are rewritten
 * as `<abstract>`.
 *
 * A line labelled [TaggingLabels.OTHER_LABEL] resets the carried-over label, so a
 * continuation label directly after it (or on the very first line) is left untouched.
 *
 * @param fulltextLabeledText the labelled text, one token per line, columns separated by tabs
 * @return the corrected text; blank lines and trailing empty columns of the input are dropped
 * and every emitted line is terminated by "\n"
 */
@JvmStatic
fun postProcessFulltextCorrectSequencesWithoutInitialToken(fulltextLabeledText: String): String {
    val result = StringBuilder()

    // Label of the previously emitted line with its leading "I-" stripped;
    // null at the start of the text and right after an OTHER_LABEL line.
    var previousLabel: String? = null

    // Plain character delimiters: no need to compile a Regex for "\n" / "\t".
    for (line in fulltextLabeledText.split('\n')) {
        if (StringUtils.isBlank(line)) continue

        // Trailing empty columns are discarded so that the label is the last non-empty column.
        val pieces = line.split('\t').dropLastWhile { it.isEmpty() }.toMutableList()
        var label = pieces.last()

        val inherited = previousLabel
        val startsNewSequence = label.startsWith("I-")
        if (label != TaggingLabels.OTHER_LABEL && !startsNewSequence &&
            inherited != null && inherited != label
        ) {
            // Label changed without an "I-" opening token: keep the running label instead.
            pieces[pieces.lastIndex] = inherited
            label = inherited
        }

        result.append(pieces.joinToString("\t")).append('\n')

        // Only the leading "I-" marker is stripped; an "I-" occurring elsewhere in the
        // label must be preserved, otherwise later equality checks would never match.
        previousLabel = if (label == TaggingLabels.OTHER_LABEL) null else label.removePrefix("I-")
    }

    return result.toString()
}

}
Loading

0 comments on commit 1fa792c

Please sign in to comment.