Skip to content

Commit

Permalink
Added a workaround for the problem with apostrophes in the CoNLL format.
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelRoeder committed Jan 25, 2024
1 parent 9e8c979 commit 55e481e
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 111 deletions.
218 changes: 111 additions & 107 deletions src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -219,115 +219,119 @@ protected List<Marking> processSingleDocument(List<String> linesOfCurrentDoc, St
whiteSpaceBehindPreviousToken = whiteSpaceBehindCurrentToken;
// split the columns
String[] token = tokenFull.split(columnSeparator);
// Check if the line has only one character
if (token[0].length() == 1) {
char ch = token[0].charAt(0); // Get the character
switch (ch) {
case '?': // falls through
case '!':
case ',':
case ')':
case ']':
case '}':
case ':':
case ';':
case '.':
case '#':
case '-':
case '=':
case '፠': // ፠ section mark // falls through
case '።': // ። full stop (period)
case '፣': // ፣ comma
case '፤': // ፤ semicolon
case '፥': // ፥ colon
case '፦': // ፦ preface colon
case '፧': // ፧ question mark
case '፨': // ፨ paragraph separator
{
whiteSpaceInFront = false;
whiteSpaceBehindCurrentToken = true;
break;
}
// General quotation characters (can be start or end)
// According to https://www.overleaf.com/learn/latex/Typesetting_quotations
case '"': // falls through
case '»': // Start in Danish, end in French, Russian, etc.
case '«': // Start in French, Russian, etc.; end in Danish
case '“': // Start in English, end in German, Lithuanian, Polish
{
// Toggle whiteSpaceBehind if the character is a quote mark
whiteSpaceInFront = !sawQuoteBefore;
whiteSpaceBehindCurrentToken = sawQuoteBefore;
sawQuoteBefore = !sawQuoteBefore;
break;
}
// English, UK ‘…’
// Start quotation characters
case '„': // Start in German, Lithuanian, Polish
case '‚': // Start in English
{
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = false;
sawQuoteBefore = true;
}
// End quotation characters
case '”': // End in a lot of languages
case '‘': // End in English
{
whiteSpaceInFront = false;
whiteSpaceBehindCurrentToken = true;
sawQuoteBefore = false;
}
case '(': // falls through
case '[':
case '{': {
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = false;
break;
}
case '፡': // ፡ word separator
default: {
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = true;
break;
}
}
// } else if (!Character.isLetterOrDigit(token[0].charAt(0))) {
// // Check if the first character of the line is not a letter or digit
// whiteSpaceInFront = false;
// // Set whiteSpaceInFront to false if the line starts with a non-letter or
// // non-digit character
} else {
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = true;
}
// Remove leading/trailing whitespaces and normalize spaces within the token
String normalizedToken = token[0].trim().replaceAll("\\s+", " ");

// If the current marking is not null AND there is no annotation column or there
// is no MARKING_INSIDE annotation --> The previous marking ended
if (currentMarking != null
&& (token.length <= annotationColumn || !token[annotationColumn].startsWith(MARKING_INSIDE))) {
currentMarking.setLength(currentText.length() - currentMarking.getStartPosition());
currentMarking = null;
}
// Ensure that there are tokens in the line and that the first token is not
// empty
if ((token.length > 0) && (token[0].length() > 0)) {
// Remove leading/trailing whitespaces and normalize spaces within the token
String normalizedToken = token[0].trim().replaceAll("\\s+", " ");
// Check again that the token is not empty
if (!normalizedToken.isEmpty()) {
// Check if the line has only one character
if (normalizedToken.length() == 1) {
char ch = normalizedToken.charAt(0); // Get the character
switch (ch) {
case '?': // falls through
case '!':
case ',':
case ')':
case ']':
case '}':
case ':':
case ';':
case '.':
case '#':
case '-':
case '=':
case '፠': // ፠ section mark
case '።': // ። full stop (period)
case '፣': // ፣ comma
case '፤': // ፤ semicolon
case '፥': // ፥ colon
case '፦': // ፦ preface colon
case '፧': // ፧ question mark
case '፨': // ፨ paragraph separator
{
whiteSpaceInFront = false;
whiteSpaceBehindCurrentToken = true;
break;
}
// General quotation characters (can be start or end)
// According to https://www.overleaf.com/learn/latex/Typesetting_quotations
case '"': // falls through
case '»': // Start in Danish, end in French, Russian, etc.
case '«': // Start in French, Russian, etc.; end in Danish
case '“': // Start in English, end in German, Lithuanian, Polish
{
// Toggle whiteSpaceBehind if the character is a quote mark
whiteSpaceInFront = !sawQuoteBefore;
whiteSpaceBehindCurrentToken = sawQuoteBefore;
sawQuoteBefore = !sawQuoteBefore;
break;
}
// English, UK ‘…’
// Start quotation characters
case '„': // Start in German, Lithuanian, Polish
case '‚': // Start in English
{
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = false;
sawQuoteBefore = true;
}
// End quotation characters
case '”': // End in a lot of languages
case '‘': // End in English
{
whiteSpaceInFront = false;
whiteSpaceBehindCurrentToken = true;
sawQuoteBefore = false;
}
case '(': // falls through
case '[':
case '{': {
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = false;
break;
}
case '፡': // ፡ word separator
default: {
whiteSpaceInFront = true;
whiteSpaceBehindCurrentToken = true;
break;
}
}
} else {
// Work around for situations with "I" followed by "'m" or "'ve": Only add a
// white space, if the token does not start with a '
whiteSpaceInFront = !normalizedToken.startsWith("'");
whiteSpaceBehindCurrentToken = true;
}

// Add the token from this line to the document's text
if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) {
currentText.append(whitespace);
} else {
currentText.append(nonWhitespace);
}
// If this line contains the start of a marking, we should keep track of it
if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) {
// Create new marking
currentMarking = createNewMarking(token, currentText.length());
markings.add(currentMarking);
// If the current marking is not null AND there is no annotation column or there
// is no MARKING_INSIDE annotation --> The previous marking ended
if (currentMarking != null && (token.length <= annotationColumn
|| !token[annotationColumn].startsWith(MARKING_INSIDE))) {
currentMarking.setLength(currentText.length() - currentMarking.getStartPosition());
currentMarking = null;
}

// Add the token from this line to the document's text
if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) {
currentText.append(whitespace);
} else {
currentText.append(nonWhitespace);
}
// If this line contains the start of a marking, we should keep track of it
if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) {
// Create new marking
currentMarking = createNewMarking(token, currentText.length());
markings.add(currentMarking);
}
// TODO 1. make the whitespace configurable to allow other word separators 2.
// Remove the previous word separator if we have a punctuation character.
// (quotation, apostrophe)
currentText.append(normalizedToken);
}
}
// TODO 1. make the whitespace configurable to allow other word separators 2.
// Remove the previous word separator if we have a punctuation character.
// (quotation, apostrophe)
currentText.append(normalizedToken);
}
// If there is an unfinished marking, finalize it
if (currentMarking != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ public static Collection<Object[]> data() {
// solution (e.g. "I 'm")
testConfigs.add(new Object[] {
"@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O",
"@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
new TypedSpanImpl(63, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 });
"@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
new TypedSpanImpl(61, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 });
testConfigs.add(new Object[] {
"@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O",
"@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
new TypedSpanImpl(86, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 });
"@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
new TypedSpanImpl(84, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 });
testConfigs.add(new Object[] {
"@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O",
"From Green Newsfeed: AHFA extends deadline for Sage Award to Nov. 5 http://tinyurl.com/24agj38",
Expand Down

0 comments on commit 55e481e

Please sign in to comment.