Added a workaround for the problem with apostrophes in the CoNLL format.
dice-group · Jan 25, 2024 · 55e481e · 55e481e
1 parent 9e8c979
commit 55e481e
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 111 deletions.
diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java
@@ -219,115 +219,119 @@ protected List<Marking> processSingleDocument(List<String> linesOfCurrentDoc, St
             whiteSpaceBehindPreviousToken = whiteSpaceBehindCurrentToken;
             // split the columns
             String[] token = tokenFull.split(columnSeparator);
-            // Check if the line has only one character
-            if (token[0].length() == 1) {
-                char ch = token[0].charAt(0); // Get the character
-                switch (ch) {
-                case '?': // falls through
-                case '!':
-                case ',':
-                case ')':
-                case ']':
-                case '}':
-                case ':':
-                case ';':
-                case '.':
-                case '#':
-                case '-':
-                case '=':
-                case '፠': // ፠ section mark // falls through
-                case '።': // ። full stop (period)
-                case '፣': // ፣ comma
-                case '፤': // ፤ semicolon
-                case '፥': // ፥ colon
-                case '፦': // ፦ preface colon
-                case '፧': // ፧ question mark
-                case '፨': // ፨ paragraph separator
-                {
-                    whiteSpaceInFront = false;
-                    whiteSpaceBehindCurrentToken = true;
-                    break;
-                }
-                // General quotation characters (can be start or end)
-                // According to https://www.overleaf.com/learn/latex/Typesetting_quotations
-                case '"': // falls through
-                case '»': // Start in Danish, end in French, Russian, etc.
-                case '«': // Start in French, Russian, etc.; end in Danish
-                case '“': // Start in English, end in German, Lithuanian, Polish
-                {
-                    // Toggle whiteSpaceBehind if the character is a quote mark
-                    whiteSpaceInFront = !sawQuoteBefore;
-                    whiteSpaceBehindCurrentToken = sawQuoteBefore;
-                    sawQuoteBefore = !sawQuoteBefore;
-                    break;
-                }
-                // English, UK ‘…’
-                // Start quotation characters
-                case '„': // Start in German, Lithuanian, Polish
-                case '‚': // Start in English
-                {
-                    whiteSpaceInFront = true;
-                    whiteSpaceBehindCurrentToken = false;
-                    sawQuoteBefore = true;
-                }
-                // End quotation characters
-                case '”': // End in a lot of languages
-                case '‘': // End in English
-                {
-                    whiteSpaceInFront = false;
-                    whiteSpaceBehindCurrentToken = true;
-                    sawQuoteBefore = false;
-                }
-                case '(': // falls through
-                case '[':
-                case '{': {
-                    whiteSpaceInFront = true;
-                    whiteSpaceBehindCurrentToken = false;
-                    break;
-                }
-                case '፡': // ፡ word separator
-                default: {
-                    whiteSpaceInFront = true;
-                    whiteSpaceBehindCurrentToken = true;
-                    break;
-                }
-                }
-//                } else if (!Character.isLetterOrDigit(token[0].charAt(0))) {
-//                    // Check if the first character of the line is not a letter or digit
-//                    whiteSpaceInFront = false;
-//                    // Set whiteSpaceInFront to false if the line starts with a non-letter or
-//                    // non-digit character
-            } else {
-                whiteSpaceInFront = true;
-                whiteSpaceBehindCurrentToken = true;
-            }
-            // Remove leading/trailing whitespaces and normalize spaces within the token
-            String normalizedToken = token[0].trim().replaceAll("\\s+", " ");
-
-            // If the current marking is not null AND there is no annotation column or there
-            // is no MARKING_INSIDE annotation --> The previous marking ended
-            if (currentMarking != null
-                    && (token.length <= annotationColumn || !token[annotationColumn].startsWith(MARKING_INSIDE))) {
-                currentMarking.setLength(currentText.length() - currentMarking.getStartPosition());
-                currentMarking = null;
-            }
+            // Ensure that there are tokens in the line and that the first token is not
+            // empty
+            if ((token.length > 0) && (token[0].length() > 0)) {
+                // Remove leading/trailing whitespaces and normalize spaces within the token
+                String normalizedToken = token[0].trim().replaceAll("\\s+", " ");
+                // Check again that the token is not empty
+                if (!normalizedToken.isEmpty()) {
+                    // Check if the line has only one character
+                    if (normalizedToken.length() == 1) {
+                        char ch = normalizedToken.charAt(0); // Get the character
+                        switch (ch) {
+                        case '?': // falls through
+                        case '!':
+                        case ',':
+                        case ')':
+                        case ']':
+                        case '}':
+                        case ':':
+                        case ';':
+                        case '.':
+                        case '#':
+                        case '-':
+                        case '=':
+                        case '፠': // ፠ section mark
+                        case '።': // ። full stop (period)
+                        case '፣': // ፣ comma
+                        case '፤': // ፤ semicolon
+                        case '፥': // ፥ colon
+                        case '፦': // ፦ preface colon
+                        case '፧': // ፧ question mark
+                        case '፨': // ፨ paragraph separator
+                        {
+                            whiteSpaceInFront = false;
+                            whiteSpaceBehindCurrentToken = true;
+                            break;
+                        }
+                        // General quotation characters (can be start or end)
+                        // According to https://www.overleaf.com/learn/latex/Typesetting_quotations
+                        case '"': // falls through
+                        case '»': // Start in Danish, end in French, Russian, etc.
+                        case '«': // Start in French, Russian, etc.; end in Danish
+                        case '“': // Start in English, end in German, Lithuanian, Polish
+                        {
+                            // Toggle whiteSpaceBehind if the character is a quote mark
+                            whiteSpaceInFront = !sawQuoteBefore;
+                            whiteSpaceBehindCurrentToken = sawQuoteBefore;
+                            sawQuoteBefore = !sawQuoteBefore;
+                            break;
+                        }
+                        // English, UK ‘…’
+                        // Start quotation characters
+                        case '„': // Start in German, Lithuanian, Polish
+                        case '‚': // Start in English
+                        {
+                            whiteSpaceInFront = true;
+                            whiteSpaceBehindCurrentToken = false;
+                            sawQuoteBefore = true;
+                        }
+                        // End quotation characters
+                        case '”': // End in a lot of languages
+                        case '‘': // End in English
+                        {
+                            whiteSpaceInFront = false;
+                            whiteSpaceBehindCurrentToken = true;
+                            sawQuoteBefore = false;
+                        }
+                        case '(': // falls through
+                        case '[':
+                        case '{': {
+                            whiteSpaceInFront = true;
+                            whiteSpaceBehindCurrentToken = false;
+                            break;
+                        }
+                        case '፡': // ፡ word separator
+                        default: {
+                            whiteSpaceInFront = true;
+                            whiteSpaceBehindCurrentToken = true;
+                            break;
+                        }
+                        }
+                    } else {
+                        // Work around for situations with "I" followed by "'m" or "'ve": Only add a
+                        // white space, if the token does not start with a '
+                        whiteSpaceInFront = !normalizedToken.startsWith("'");
+                        whiteSpaceBehindCurrentToken = true;
+                    }
 
-            // Add the token from this line to the document's text
-            if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) {
-                currentText.append(whitespace);
-            } else {
-                currentText.append(nonWhitespace);
-            }
-            // If this line contains the start of a marking, we should keep track of it
-            if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) {
-                // Create new marking
-                currentMarking = createNewMarking(token, currentText.length());
-                markings.add(currentMarking);
+                    // If the current marking is not null AND there is no annotation column or there
+                    // is no MARKING_INSIDE annotation --> The previous marking ended
+                    if (currentMarking != null && (token.length <= annotationColumn
+                            || !token[annotationColumn].startsWith(MARKING_INSIDE))) {
+                        currentMarking.setLength(currentText.length() - currentMarking.getStartPosition());
+                        currentMarking = null;
+                    }
+
+                    // Add the token from this line to the document's text
+                    if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) {
+                        currentText.append(whitespace);
+                    } else {
+                        currentText.append(nonWhitespace);
+                    }
+                    // If this line contains the start of a marking, we should keep track of it
+                    if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) {
+                        // Create new marking
+                        currentMarking = createNewMarking(token, currentText.length());
+                        markings.add(currentMarking);
+                    }
+                    // TODO 1. make the whitespace configurable to allow other word separators 2.
+                    // Remove the previous word separator if we have a punctuation character.
+                    // (quotation, apostrophe)
+                    currentText.append(normalizedToken);
+                }
             }
-            // TODO 1. make the whitespace configurable to allow other word separators 2.
-            // Remove the previous word separator if we have a punctuation character.
-            // (quotation, apostrophe)
-            currentText.append(normalizedToken);
         }
         // If there is an unfinished marking, finalize it
         if (currentMarking != null) {

diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java
@@ -50,12 +50,12 @@ public static Collection<Object[]> data() {
         // solution (e.g. "I 'm")
         testConfigs.add(new Object[] {
                 "@paulwalk	O\nIt	O\n's	O\nthe	O\nview	O\nfrom	O\nwhere	O\nI	O\n'm	O\nliving	O\nfor	O\ntwo	O\nweeks	O\n.	O\nEmpire	B-facility\nState	I-facility\nBuilding	I-facility\n=	O\nESB	B-facility\n.	O\nPretty	O\nbad	O\nstorm	O\nhere	O\nlast	O\nevening	O\n.	O\n	\nFrom	O\nGreen	O\nNewsfeed	O\n:	O\nAHFA	B-other\nextends	O\ndeadline	O\nfor	O\nSage	B-other\nAward	I-other\nto	O\nNov	O\n.	O\n5	O\nhttp://tinyurl.com/24agj38	O",
-                "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
-                new TypedSpanImpl(63, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 });
+                "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
+                new TypedSpanImpl(61, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 });
         testConfigs.add(new Object[] {
                 "@paulwalk	O\nIt	O\n's	O\nthe	O\nview	O\nfrom	O\nwhere	O\nI	O\n'm	O\nliving	O\nfor	O\ntwo	O\nweeks	O\n.	O\nEmpire	B-facility\nState	I-facility\nBuilding	I-facility\n=	O\nESB	B-facility\n.	O\nPretty	O\nbad	O\nstorm	O\nhere	O\nlast	O\nevening	O\n.	O\n	\nFrom	O\nGreen	O\nNewsfeed	O\n:	O\nAHFA	B-other\nextends	O\ndeadline	O\nfor	O\nSage	B-other\nAward	I-other\nto	O\nNov	O\n.	O\n5	O\nhttp://tinyurl.com/24agj38	O",
-                "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
-                new TypedSpanImpl(86, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 });
+                "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.",
+                new TypedSpanImpl(84, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 });
         testConfigs.add(new Object[] {
                 "@paulwalk	O\nIt	O\n's	O\nthe	O\nview	O\nfrom	O\nwhere	O\nI	O\n'm	O\nliving	O\nfor	O\ntwo	O\nweeks	O\n.	O\nEmpire	B-facility\nState	I-facility\nBuilding	I-facility\n=	O\nESB	B-facility\n.	O\nPretty	O\nbad	O\nstorm	O\nhere	O\nlast	O\nevening	O\n.	O\n	\nFrom	O\nGreen	O\nNewsfeed	O\n:	O\nAHFA	B-other\nextends	O\ndeadline	O\nfor	O\nSage	B-other\nAward	I-other\nto	O\nNov	O\n.	O\n5	O\nhttp://tinyurl.com/24agj38	O",
                 "From Green Newsfeed: AHFA extends deadline for Sage Award to Nov. 5 http://tinyurl.com/24agj38",