From 207a1aba9662b5403364760e7ebfe200e491a34c Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:00:11 +0200 Subject: [PATCH 01/25] masakha:handle punctuation of languages --- .../impl/conll/CoNLLTypeRetriever.java | 4 +- .../impl/conll/GenericCoNLLDataset.java | 59 ++++++++++++++++++- .../impl/derczysnki/DerczynskiDataset.java | 2 +- .../dataset/impl/indqner/IndQNERDataset.java | 2 +- .../dataset/impl/ritter/RitterDataset.java | 2 +- .../gerbil/dataset/impl/umbc/UMBCDataset.java | 2 +- .../gerbil/dataset/impl/wnut/WNUTDataset.java | 2 +- 7 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java index 0257f0cc4..2ff118ac1 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java @@ -36,11 +36,12 @@ public class CoNLLTypeRetriever { private static final String SPORTS_TEAM_URI = "http://dbpedia.org/ontology/SportsTeam"; private static final String TV_SHOW_URI = "http://dbpedia.org/ontology/TelevisionShow"; private static final String ORGANISATION_URI = "http://dbpedia.org/ontology/Organisation"; + private static final String LANGUAGE = "http://dbpedia.org/ontology/Language"; private Map annotationToType; public CoNLLTypeRetriever(String place, String company, String film, String musicalArtist, String unknown, - String person, String product, String sportsTeam, String tvShow, String organisation) { + String person, String product, String sportsTeam, String tvShow, String organisation, String language) { annotationToType = new HashMap<>(); annotationToType.put(place, PLACE_URI); annotationToType.put(company, COMPANY_URI); @@ -52,6 +53,7 @@ public CoNLLTypeRetriever(String place, String company, String film, String musi annotationToType.put(sportsTeam, SPORTS_TEAM_URI); annotationToType.put(tvShow, 
TV_SHOW_URI); annotationToType.put(organisation, ORGANISATION_URI); + annotationToType.put(language, LANGUAGE); } /** diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 545ee85d6..0b02b7061 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -90,6 +90,8 @@ public class GenericCoNLLDataset extends AbstractDataset implements Initializabl * it is set to -1. */ protected int uriColumn; + + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); @@ -141,6 +143,10 @@ protected List loadDocuments(File file) throws GerbilException { // Create namespace for the documents of this dataset String documentUriPrefix = "http://" + getName() + "/"; StringBuilder textOfCurrentDocument = new StringBuilder(); + // Flag to track if a whitespace should be inserted in front of a line + boolean whiteSpaceInFront = true; + // Flag to track if a whitespace should be inserted behind a line + boolean whiteSpaceBehind = true; try (BufferedReader reader = new BufferedReader( new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));) { String line = reader.readLine(); @@ -165,9 +171,60 @@ protected List loadDocuments(File file) throws GerbilException { // Increase the document ID index++; } - } else { + } else { + if ((textOfCurrentDocument.length() > 0) && (line.length() >= 1)) { + if (line.length() == 1) { // Check if the line has only one character + char ch = line.charAt(0); // Get the character + switch (ch) { + case '?': + case '!': + case ',': + case ')': + case ']': + case '}': + case '.': + case '፠': // ፠ section mark + case '፡': // ፡ word separator + case '።': // ። full stop (period) + case '፣': // ፣ comma + case 
'፤': // ፤ semicolon + case '፥': // ፥ colon + case '፦': // ፦ preface colon + case '፧': // ፧ question mark + case '፨': // ፨ paragraph separator + // Set whiteSpaceInFront to false if the character is a punctuation mark + // that does not require a whitespace in front + whiteSpaceInFront = false; + break; + case '"': + // Toggle whiteSpaceBehind if the character is a quote mark + whiteSpaceBehind = !whiteSpaceBehind; + break; + case '(': + case '[': + case '{': + // Set whiteSpaceBehind to false if the character is an opening parenthesis or bracket + whiteSpaceBehind = false; + break; + default: + break; + } + + } + else if (!Character.isLetterOrDigit(line.charAt(0))) { + // Check if the first character of the line is not a letter or digit + whiteSpaceInFront = false; + // Set whiteSpaceInFront to false if the line starts with a non-letter or non-digit character + } + if (whiteSpaceInFront) { + textOfCurrentDocument.append(' '); + // Append a whitespace to separate the current line from the previous content + } + } // Add the current line to the list of lines of the current document linesOfCurrentDoc.add(line); + // Append the current line to the text of the current document + textOfCurrentDocument.append(line); } // Read the next line line = reader.readLine(); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java index 1aa2105e8..79fa82568 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java @@ -24,7 +24,7 @@ public class DerczynskiDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 2; private static final int URI_COLUMN = 1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("geo-loc", "company", "movie", "musicartist", - "other", "person", "product", "sportsteam", "tv-show", 
null); + "other", "person", "product", "sportsteam", "tv-show", null, null); public DerczynskiDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java index 93b49bdfd..3561cf75f 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java @@ -16,7 +16,7 @@ public class IndQNERDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("GeographicalLocation", null, null, null, - null, "Person", null, null, null, null); + null, "Person", null, null, null, null, null); public IndQNERDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java index 2595960a5..c3ac0a57c 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java @@ -24,7 +24,7 @@ public class RitterDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("geo-loc", null, "movie", "musicartist", - "other", "person", "product", "sportsteam", "tvshow", null); + "other", "person", "product", "sportsteam", "tvshow", null, null); public RitterDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java index 15b740551..136a26f40 
100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java @@ -24,7 +24,7 @@ public class UMBCDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, - null, "PER", null, null, null, "ORG"); + null, "PER", null, null, null, "ORG", null); public UMBCDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java index 818c69113..f1b5a7809 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java @@ -27,7 +27,7 @@ public class WNUTDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("location", "corporation", null, null, - null, "person", "product", null, null, null); + null, "person", "product", null, null, null, null); public WNUTDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); From 49987d0922175f8ff87fb1ef9e12903b389c07fd Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:02:00 +0200 Subject: [PATCH 02/25] added languages of masakha 1 --- .../impl/masakha/MasakhaNERDataset.java | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java new file mode 100644 index 000000000..da2fc862c --- /dev/null +++ 
b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -0,0 +1,60 @@ +package org.aksw.gerbil.dataset.impl.masakha; + +import org.aksw.gerbil.dataset.impl.conll.GenericCoNLLDataset; +import org.aksw.gerbil.dataset.impl.conll.CoNLLTypeRetriever; + +public class MasakhaNERDataset extends GenericCoNLLDataset { + private static final int ANNOTATION_COLUMN = 2; + private static final int URI_COLUMN = 1; + private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever( + "GeographicalLocation", null, null, null, null, null, null, + null, null, null, "language"); + private String languageCode; + + public MasakhaNERDataset(String file) { + super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); + } + + /** + * Stores the lower-cased language code and registers the matching language type URI on TYPE_TAGS; unknown codes register nothing. + * NOTE(review): TYPE_TAGS is static, so a registered URI is shared by every instance of this class. + * @param language the three-letter ISO 639-3 language code, matched case-insensitively (e.g. "amh") + */ + public void setLanguage(String language) { + languageCode = language.toLowerCase(java.util.Locale.ROOT); + + switch (languageCode) { + case "amh": + TYPE_TAGS.addTypeURI("amharic_type", "http://dbpedia.org/resource/Amharic_language"); + break; + case "hau": + TYPE_TAGS.addTypeURI("hausa_type", "http://dbpedia.org/resource/Hausa_language"); + break; + case "ibo": + TYPE_TAGS.addTypeURI("igbo_type", "http://dbpedia.org/resource/Igbo_language"); + break; + case "kin": + TYPE_TAGS.addTypeURI("kinyarwanda_type", "http://dbpedia.org/resource/Kinyarwanda_language"); + break; + case "lug": + TYPE_TAGS.addTypeURI("luganda_type", "http://dbpedia.org/resource/Luganda_language"); + break; + case "lu": case "luo": + TYPE_TAGS.addTypeURI("luo_type", "http://dbpedia.org/resource/Luo_language"); + break; + case "pcm": + TYPE_TAGS.addTypeURI("pidgin_type", "http://dbpedia.org/resource/Nigerian_Pidgin"); + break; + case "swa": + TYPE_TAGS.addTypeURI("swahili_type", "http://dbpedia.org/resource/Swahili_language"); + break; + case "wol": + TYPE_TAGS.addTypeURI("wolof_type", "http://dbpedia.org/resource/Wolof_language"); + break; + case "yor": + TYPE_TAGS.addTypeURI("yoruba_type", 
"http://dbpedia.org/resource/Yoruba_language"); + break; + } + } +} + From bd6ec130a5f634548b0886fd22a8faa60449f4ab Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:03:44 +0200 Subject: [PATCH 03/25] added test case for languages of masakha 1 --- .../impl/masakha/MasakhaNERDatasetTest.java | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java new file mode 100644 index 000000000..38e47cd17 --- /dev/null +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -0,0 +1,74 @@ +package org.aksw.gerbil.dataset.impl.masakha; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.aksw.gerbil.dataset.InitializableDataset; +import org.aksw.gerbil.dataset.impl.conll.AbstractGenericCoNLLDatasetTest; +import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.data.TypedSpanImpl; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public class MasakhaNERDatasetTest extends AbstractGenericCoNLLDatasetTest { + + public MasakhaNERDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId, + int markingId) { + super(fileContent, text, expectedMarking, documentId, markingId); + } + + @Override + public InitializableDataset createDataset(File file) { + return new MasakhaNERDataset(file.getAbsolutePath()); + } + + @Parameterized.Parameters + public static Collection data() { + List testConfigs = new ArrayList<>(); + // Amharic language + testConfigs.add(new Object[] { + "እንደ O\n ኤለርስ B-LOC\n አብዛኞቹ O\n የአፍሪቃ B-LOC\n ሀገሮች O\n በአብዛኛዉ O\n የሥራ O\n ቦታ O\n ያለዉ O\n እርሻ O\n ላይ O\n ነዉ O\n ። O", + new 
TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Amharic_language"), 0, 0 }); + // Hausa language + testConfigs.add(new Object[] { + "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", + new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Igbo_language"), 1, 0 }); + // Igbo language + testConfigs.add(new Object[] { + "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", + new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 2, 0 }); + // Kinyarwanda language + testConfigs.add(new Object[] { + "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", + new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 3, 0 }); + // Luganda language + testConfigs.add(new Object[] { + "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", + new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Luganda_language"), 4, 0 }); + // Luo language + testConfigs.add(new Object[] { + "Kwan O\n jii O\n maromo O\n 796 O\n mane O\n oyudi O\n ni O\n nigi O\n Covid O\n - O\n 19 O\n ei O\n kawuononi B-DATE", + new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Luo_language"), 5, 0 }); + // Nigerian Pidgin language + testConfigs.add(new Object[] { + "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n", ", O\n", "Florida B-LOC\n", ". 
O", + new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Nigerian_Pidgin"), 6, 0 }); + // Swahili language + testConfigs.add(new Object[] { + "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . O\n", + new TypedSpanImpl(3, 2, "http://dbpedia.org/resource/Swahili_language"), 7, 0 }); + // Wolof language + testConfigs.add(new Object[] { + "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", + new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Wolof_language"), 8, 0 }); + // Yoruba language + testConfigs.add(new Object[] { + "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . 
O ", + new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); + return testConfigs; + } + +} From 1225cf8db50482dc3be6b38542fb8dd761f8005b Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:05:36 +0200 Subject: [PATCH 04/25] masakha: added dataset.properties of languages --- src/main/properties/datasets.properties | 165 ++++++++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 31d5fce86..6740aab06 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -313,6 +313,171 @@ org.aksw.gerbil.datasets.definition.KORE50.cacheable=true org.aksw.gerbil.datasets.definition.KORE50.experimentType=A2KB org.aksw.gerbil.datasets.definition.KORE50.constructorArgs=${org.aksw.gerbil.datasets.KORE50.file},${org.aksw.gerbil.datasets.definition.KORE50.name} + + +### Masakha +org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir=${org.aksw.gerbil.DataPath}/datasets/masakha +### Amharic +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.name=MasakhaNER-Amharic-Dev +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.name=MasakhaNER-Amharic-Test +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/test.txt 
+org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.name=MasakhaNER-Amharic-Train +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=RT2KB +### Hausa +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.name=MasakhaNER-Hausa-Dev +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.name=MasakhaNER-Hausa-Test +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.name=MasakhaNER-Hausa-Train +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=RT2KB +### Igbo +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.name=MasakhaNER-Igbo-Dev +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.name=MasakhaNER-Igbo-Test +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.name=MasakhaNER-Igbo-Train +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=RT2KB +### Kinyarwanda +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.name=MasakhaNER-Kinyarwanda-Dev 
+org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.name=MasakhaNER-Kinyarwanda-Test +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.name=MasakhaNER-Kinyarwanda-Train +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=RT2KB +### Luganda +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.name=MasakhaNER-Luganda-Dev +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.name=MasakhaNER-Luganda-Test +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.name=MasakhaNER-Luganda-Train +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=RT2KB +### Luo +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.name=MasakhaNER-Luo-Dev +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.name=MasakhaNER-Luo-Test +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/test.txt 
+org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.name=MasakhaNER-Luo-Train +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=RT2KB +### Naija +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.name=MasakhaNER-Naija-Dev +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.name=MasakhaNER-Naija-Test +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.name=MasakhaNER-Naija-Train +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/train.txt 
+org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=RT2KB +### Swahili +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.name=MasakhaNER-Swahili-Dev +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.name=MasakhaNER-Swahili-Test +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.name=MasakhaNER-Swahili-Train +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=RT2KB +### Wolof +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.name=MasakhaNER-Wolof-Dev +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.name=MasakhaNER-Wolof-Test +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.name=MasakhaNER-Wolof-Train +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=RT2KB +### Yoruba +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.name=MasakhaNER-Yoruba-Dev +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.name=MasakhaNER-Yoruba-Test 
+org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER-Yoruba-Train +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB + ### Meij #org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list #org.aksw.gerbil.datasets.MeijDatasetConfig.tagsFile=${org.aksw.gerbil.DataPath}/datasets/meij/wsdm2012_annotations.txt From ea44b7f27d7f7d06538069678a6cce1acea412a2 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:06:22 +0200 Subject: [PATCH 05/25] added languages of masakha 2 --- .../impl/masakha/MasakhaNERDataset.java | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index da2fc862c..dc40a8af6 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -54,6 +54,39 @@ public void setLanguage(String language) { case "yor": TYPE_TAGS.addTypeURI("yoruba_type", "http://dbpedia.org/resource/Yoruba_language"); break; + case "bam": + 
TYPE_TAGS.addTypeURI("bambara_type", "http://dbpedia.org/resource/Bambara_language"); + break; + case "ewe": + TYPE_TAGS.addTypeURI("ewe_type", "http://dbpedia.org/resource/Ewe_language"); + break; + case "fon": + TYPE_TAGS.addTypeURI("fon_type", "http://dbpedia.org/resource/Fon_language"); + break; + case "mos": + TYPE_TAGS.addTypeURI("mossi_type", "http://dbpedia.org/resource/Mossi_language"); + break; + case "bbj": + TYPE_TAGS.addTypeURI("ghomala_type", "http://dbpedia.org/resource/Ghomala_language"); + break; + case "nya": + TYPE_TAGS.addTypeURI("chichewa_type", "http://dbpedia.org/resource/Chichewa_language"); + break; + case "tsn": + TYPE_TAGS.addTypeURI("setswana_type", "http://dbpedia.org/resource/Setswana_language"); + break; + case "twi": + TYPE_TAGS.addTypeURI("twi_type", "http://dbpedia.org/resource/Twi_language"); + break; + case "sna": + TYPE_TAGS.addTypeURI("chishona_type", "http://dbpedia.org/resource/Chishona_language"); + break; + case "xho": + TYPE_TAGS.addTypeURI("isixhosa_type", "http://dbpedia.org/resource/IsiXhosa_language"); + break; + case "zul": + TYPE_TAGS.addTypeURI("isizulu_type", "http://dbpedia.org/resource/IsiZulu_language"); + break; } } } From 6341f9c7e6c08bfdf5e265f359f8c90c0dc18151 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:06:48 +0200 Subject: [PATCH 06/25] masakha: added dataset.properties of languages --- src/main/properties/datasets.properties | 179 +++++++++++++++++++++++- 1 file changed, 178 insertions(+), 1 deletion(-) diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 6740aab06..6dce297d8 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -476,7 +476,184 @@ org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER-Yoruba- org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/train.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB +### Bambara language +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.name=MasakhaNER-Bambara-Dev +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.name=MasakhaNER-Bambara-Test +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.name=MasakhaNER-Bambara-Train +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=RT2KB +### Ewe language 
+org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.name=MasakhaNER-Ewe-Dev +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.name=MasakhaNER-Ewe-Test +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.name=MasakhaNER-Ewe-Train +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=RT2KB +### Fon language +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.name=MasakhaNER-Fon-Dev +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=RT2KB 
+org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.name=MasakhaNER-Fon-Test +org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.name=MasakhaNER-Fon-Train +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=RT2KB +### Mossi language +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.name=MasakhaNER-Mossi-Dev +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.name=MasakhaNER-Mossi-Test +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=RT2KB 
+org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.name=MasakhaNER-Mossi-Train +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=RT2KB +### Ghomala language +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.name=MasakhaNER-Ghomala-Dev +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.name=MasakhaNER-Ghomala-Test +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.name=MasakhaNER-Ghomala-Train +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=RT2KB +### Chichewa language +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.name=MasakhaNER-Chichewa-Dev +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.name=MasakhaNER-Chichewa-Test +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.name=MasakhaNER-Chichewa-Train +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=RT2KB +### Setswana language +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.name=MasakhaNER-Setswana-Dev +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.name=MasakhaNER-Setswana-Test +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.name=MasakhaNER-Setswana-Train +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=RT2KB + +### (Akan/Twi) language +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.name=MasakhaNER-Twi-Dev +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.name=MasakhaNER-Twi-Test 
+org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/test.txt +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.name=MasakhaNER-Twi-Train +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/train.txt +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=RT2KB +### chiShona language +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.name=MasakhaNER-chiShona-Dev +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.name=MasakhaNER-chiShona-Test +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/test.txt +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=RT2KB 
+org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.name=MasakhaNER-chiShona-Train +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/train.txt +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=RT2KB +### isiXhosa language +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.name=MasakhaNER-isiXhosa-Dev +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.name=MasakhaNER-isiXhosa-Test +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/test.txt +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.name=MasakhaNER-isiXhosa-Train +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/train.txt 
+org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=RT2KB +### isiZulu language +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.name=MasakhaNER-isiZulu-Dev +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/dev.txt +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.name=MasakhaNER-isiZulu-Test +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/test.txt +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.name=MasakhaNER-isiZulu-Train +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/train.txt +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=RT2KB ### Meij #org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list From b83e9aea0f4fc3d850929bdd1f58d83a0a723759 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 10:07:10 +0200 Subject: [PATCH 07/25] added test 
case for languages of masakha 2 --- .../impl/masakha/MasakhaNERDatasetTest.java | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 38e47cd17..4270f4cf8 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -67,7 +67,52 @@ public static Collection data() { // Yoruba language testConfigs.add(new Object[] { "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", - new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); + new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); + // Bambara language + testConfigs.add(new Object[] { + "Nin O waati O in O na O , O a O ka O gɛlɛn O mɔgɔ O k'i O dantigɛ O a O fatuli O kun O jɔnjɔnw O kan O , O k'a O da O a O kan O a O sababuw O tolen O bɛ O dibi O bɛ O . O", + new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Bambara_language"), 12, 0 }); + // Ghomala language + testConfigs.add(new Object[] { + "Msaʼnyə̂ O gɔtí O cyətə O nə́ O bǎyá O cyə́ O nəjí O pôʼ O bǎhə́lə́ O", + new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ghomala_language"), 16, 0 }); + // Ewe language + testConfigs.add(new Object[] { + "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", + new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ewe_language"), 13, 0 }); + // Fon language + testConfigs.add(new Object[] { + "È O ká O wɔn O dotóo O lɛ́ɛ O ɖesu O ǎ O . 
O", + new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Fon_language"), 14, 0 }); + // Mossi language + testConfigs.add(new Object[] { + "Yao O sãan O wa O mikame O tɩ O lamd O n O dɩk O bugm O . O", + new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Mossi_language"), 10, 0 }); + // Chichewa language + testConfigs.add(new Object[] { + "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", + new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/Chichewa_language"), 17, 0 }); + // Setswana language + testConfigs.add(new Object[] { + "E O ne O e O le O motlotli O wa O dikgang O yo O gaisang O . O", + new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Setswana_language"), 18, 0 }); + // Twi (Akan/Twi) language + testConfigs.add(new Object[] { + "Sɛ O yɛwɔ O tema O ma O obi O a O , O yebehu O sɛ O ɛnsono O yɛn O ɛnna O ɛsono O ɔno O . O", + new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/Akan_language"), 15, 0 }); + // chiShona language + testConfigs.add(new Object[] { + "Huwandu O uhu O hunotarisirwa O kukwira O . O", + new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/chiShona_language"), 19, 0 }); + // isiXhosa language + testConfigs.add(new Object[] { + "Konakala O izinto O emsebenzini O emva O kokoyisakala O bubuthonga O ngenxa O yobude O bendlela O . 
O", + new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/isiXhosa_language"), 20, 0 }); + // isiZulu language + testConfigs.add(new Object[] { + "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", + new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/isiZulu_language"), 21, 0 }); + return testConfigs; } From 8d44f2882b9d84dd9707587e6899a11216b2e569 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 11:38:27 +0200 Subject: [PATCH 08/25] RESOLVE: updated test config --- .../impl/masakha/MasakhaNERDatasetTest.java | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 4270f4cf8..9e914c9c0 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -31,86 +31,107 @@ public static Collection data() { // Amharic language testConfigs.add(new Object[] { "እንደ O\n ኤለርስ B-LOC\n አብዛኞቹ O\n የአፍሪቃ B-LOC\n ሀገሮች O\n በአብዛኛዉ O\n የሥራ O\n ቦታ O\n ያለዉ O\n እርሻ O\n ላይ O\n ነዉ O\n ። O", - new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Amharic_language"), 0, 0 }); + "እንደ O ኤለርስ B-LOC አብዛኞቹ O የአፍሪቃ B-LOC ሀገሮች O በአብዛኛዉ O የሥራ O ቦታ O ያለዉ O እርሻ O ላይ O ነዉ O ። O", + new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Amharic_language"), 0, 0 }); // Hausa language testConfigs.add(new Object[] { "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", - new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Igbo_language"), 1, 0 }); + "Ga O dai O cikakken O hirar O : O", + new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Igbo_language"), 1, 0 }); // Igbo language testConfigs.add(new Object[] { "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme 
O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", - new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 2, 0 }); + "Igbokwe B-PER sịrị O n'aka O ndị O ndu O APC B-ORG adịghị O n'ihe O mere O n'Oshodi B-LOC - O Joe B-PER Igbokwe I-PER", + new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 2, 0 }); // Kinyarwanda language testConfigs.add(new Object[] { "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", - new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 3, 0 }); + "Amabwiriza O yo O kubungabunga O ubuzima O hirindwa O COVID O - O 19 O", + new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 3, 0 }); // Luganda language testConfigs.add(new Object[] { "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", + "Ekivvulu O kino O Kiwagiddwa O Vision B-ORG Group I-ORG efulumya O ne O Bukedde B-ORG . O", new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Luganda_language"), 4, 0 }); // Luo language testConfigs.add(new Object[] { "Kwan O\n jii O\n maromo O\n 796 O\n mane O\n oyudi O\n ni O\n nigi O\n Covid O\n - O\n 19 O\n ei O\n kawuononi B-DATE", + "Jii O adek O mawuok O e O familia O achiel O polo O onego O", new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Luo_language"), 5, 0 }); // Nigerian Pidgin language testConfigs.add(new Object[] { - "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n", ", O\n", "Florida B-LOC\n", ". 
O", + "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", + "Dat O na O how O we O take O start O wit O label O . O", new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Nigerian_Pidgin"), 6, 0 }); // Swahili language testConfigs.add(new Object[] { "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . O\n", + "Watu O watatu O wengine O waliokuwa O ndani O ya O basi O wameripotiwa O kujeruhiwa O katika O shambulizi O hilo O . O", new TypedSpanImpl(3, 2, "http://dbpedia.org/resource/Swahili_language"), 7, 0 }); // Wolof language testConfigs.add(new Object[] { "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", - new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Wolof_language"), 8, 0 }); + "Waxatuñu O dara O , O nga O lebi O xaalis O bu O dul O jeex O ci O turu O askan O wi O . O", + new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Wolof_language"), 8, 0 }); // Yoruba language testConfigs.add(new Object[] { "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", - new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); + "Àwòrán O àgékù O láti O ibùdó O Channel B-ORG Television I-ORG You I-ORG Tube I-ORG . 
O", + new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); // Bambara language testConfigs.add(new Object[] { "Nin O waati O in O na O , O a O ka O gɛlɛn O mɔgɔ O k'i O dantigɛ O a O fatuli O kun O jɔnjɔnw O kan O , O k'a O da O a O kan O a O sababuw O tolen O bɛ O dibi O bɛ O . O", + "Sɔrɔdasiw O ye O polisiw O labɔ O kulusigi O jɔyɔrɔ O fɔlɔ O . O", new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Bambara_language"), 12, 0 }); // Ghomala language testConfigs.add(new Object[] { "Msaʼnyə̂ O gɔtí O cyətə O nə́ O bǎyá O cyə́ O nəjí O pôʼ O bǎhə́lə́ O", + "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ghomala_language"), 16, 0 }); // Ewe language testConfigs.add(new Object[] { "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", + "Le O Togo B-LOC : O wotso O afia O na O ame O 200 O ɖe O akpa O sesẽ O nu O wɔwɔ O ame O ŋu O . O", new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ewe_language"), 13, 0 }); // Fon language testConfigs.add(new Object[] { "È O ká O wɔn O dotóo O lɛ́ɛ O ɖesu O ǎ O . O", + "Sɔmì O sɔmì O sɛ́n O ɔ́ O lɔ́ɔ O dó O zogbeji O . O", new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Fon_language"), 14, 0 }); // Mossi language testConfigs.add(new Object[] { "Yao O sãan O wa O mikame O tɩ O lamd O n O dɩk O bugm O . O", + "Sẽn O geta O tẽnga O yell O rɛɛgdã O a O Tamotsu B-PER Ikezaki I-PER menga O zĩinda O tʋʋdã O tɩʋʋsgo O . O", new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Mossi_language"), 10, 0 }); // Chichewa language testConfigs.add(new Object[] { "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", + "Dziko O lino O lili O ndi O zipani O zoposera O 50 O . 
O", new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/Chichewa_language"), 17, 0 }); // Setswana language testConfigs.add(new Object[] { "E O ne O e O le O motlotli O wa O dikgang O yo O gaisang O . O", + "Mo O bidiong O e O e O fa O tlase O , O O My O Octopus O Teacher O e O ne O newa O sekgele O sa O Oscar B-ORG . O", new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Setswana_language"), 18, 0 }); // Twi (Akan/Twi) language testConfigs.add(new Object[] { "Sɛ O yɛwɔ O tema O ma O obi O a O , O yebehu O sɛ O ɛnsono O yɛn O ɛnna O ɛsono O ɔno O . O", + "Ɛmfa O ho O nea O wobɛyɛ O biara O no O , O ɛsɛ O sɛ O wudi O nhyehyɛe O pa O no O akyi O . O", new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/Akan_language"), 15, 0 }); // chiShona language testConfigs.add(new Object[] { "Huwandu O uhu O hunotarisirwa O kukwira O . O", + "Patafonera O chipatara O ichi O chati O titumire O mibvunzo O asi O tange O tisati O tawana O mhinduro O pataenda O kumhepo O . O", new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/chiShona_language"), 19, 0 }); // isiXhosa language testConfigs.add(new Object[] { "Konakala O izinto O emsebenzini O emva O kokoyisakala O bubuthonga O ngenxa O yobude O bendlela O . O", + "Ọwọ́ O líle O làwọn O ẹ̀gbọ́n O mi O tí O mo O gbé O ọ̀dọ̀ O wọn O fi O mú O mi O . O", new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/isiXhosa_language"), 20, 0 }); // isiZulu language testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", + "Eqhuba O , O uqinisekisile O emalungwini O omphakathi O ukuthi O umasipala O uzozibophezela O ekuphuthumiseni O izidingo O zawo O . 
O", new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/isiZulu_language"), 21, 0 }); return testConfigs; From e301b82770559f6a741d81dbb7a4776fb3ffda66 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Thu, 13 Jul 2023 13:23:44 +0200 Subject: [PATCH 09/25] fixed URI --- .../aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index dc40a8af6..ef2ab1649 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -28,10 +28,10 @@ public void setLanguage(String language) { TYPE_TAGS.addTypeURI("amharic_type", "http://dbpedia.org/resource/Amharic_language"); break; case "hau": - TYPE_TAGS.addTypeURI("hausa_type", "http://dbpedia.org/resource/Igbo_language"); + TYPE_TAGS.addTypeURI("hausa_type", "http://dbpedia.org/resource/Hausa_language"); break; case "ibo": - TYPE_TAGS.addTypeURI("igbo_type", "http://dbpedia.org/resource/Kinyarwanda_language"); + TYPE_TAGS.addTypeURI("igbo_type", "http://dbpedia.org/resource/Igbo_language"); break; case "kin": TYPE_TAGS.addTypeURI("kinyarwanda_type", "http://dbpedia.org/resource/Kinyarwanda_language"); From 678ee1fc49db7c282db00cbb41838e7bb409e451 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Tue, 18 Jul 2023 22:51:08 +0200 Subject: [PATCH 10/25] updated punctuation handling in masakha --- .../impl/conll/CoNLLTypeRetriever.java | 4 +- .../impl/conll/GenericCoNLLDataset.java | 7 +- .../impl/derczysnki/DerczynskiDataset.java | 2 +- .../dataset/impl/indqner/IndQNERDataset.java | 2 +- .../impl/masakha/MasakhaNERDataset.java | 80 +------------------ .../dataset/impl/ritter/RitterDataset.java | 2 +- .../gerbil/dataset/impl/umbc/UMBCDataset.java | 2 +- .../gerbil/dataset/impl/wnut/WNUTDataset.java | 2 +- 8 files 
changed, 12 insertions(+), 89 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java index 2ff118ac1..0257f0cc4 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/CoNLLTypeRetriever.java @@ -36,12 +36,11 @@ public class CoNLLTypeRetriever { private static final String SPORTS_TEAM_URI = "http://dbpedia.org/ontology/SportsTeam"; private static final String TV_SHOW_URI = "http://dbpedia.org/ontology/TelevisionShow"; private static final String ORGANISATION_URI = "http://dbpedia.org/ontology/Organisation"; - private static final String LANGUAGE = "https://dbpedia.org/ontology/language"; private Map annotationToType; public CoNLLTypeRetriever(String place, String company, String film, String musicalArtist, String unknown, - String person, String product, String sportsTeam, String tvShow, String organisation, String language) { + String person, String product, String sportsTeam, String tvShow, String organisation) { annotationToType = new HashMap<>(); annotationToType.put(place, PLACE_URI); annotationToType.put(company, COMPANY_URI); @@ -53,7 +52,6 @@ public CoNLLTypeRetriever(String place, String company, String film, String musi annotationToType.put(sportsTeam, SPORTS_TEAM_URI); annotationToType.put(tvShow, TV_SHOW_URI); annotationToType.put(organisation, ORGANISATION_URI); - annotationToType.put(language, LANGUAGE); } /** diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 0b02b7061..a618d84e3 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -183,6 +183,8 @@ protected List loadDocuments(File file) throws GerbilException { case 
']': case '}': case '.': + whiteSpaceInFront = false; + break; case '፠': // ፠ section mark case '፡': // ፡ word separator case '።': // ። full stop (period) @@ -192,9 +194,8 @@ protected List loadDocuments(File file) throws GerbilException { case '፦': // ፦ preface colon case '፧': // ፧ question mark case '፨': // ፨ paragraph separator - // Set whiteSpaceInFront to false if the character is a punctuation mark - // that does not require a whitespace in front - whiteSpaceInFront = false; + whiteSpaceInFront = true; + whiteSpaceBehind = true; break; case '"': // Toggle whiteSpaceBehind if the character is a quote mark diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java index 79fa82568..1aa2105e8 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/derczysnki/DerczynskiDataset.java @@ -24,7 +24,7 @@ public class DerczynskiDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 2; private static final int URI_COLUMN = 1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("geo-loc", "company", "movie", "musicartist", - "other", "person", "product", "sportsteam", "tv-show", null, null); + "other", "person", "product", "sportsteam", "tv-show", null); public DerczynskiDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java index 3561cf75f..93b49bdfd 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDataset.java @@ -16,7 +16,7 @@ public class IndQNERDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int 
URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("GeographicalLocation", null, null, null, - null, "Person", null, null, null, null, null); + null, "Person", null, null, null, null); public IndQNERDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index ef2ab1649..74b2d9d0e 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -7,87 +7,11 @@ public class MasakhaNERDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 2; private static final int URI_COLUMN = 1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever( - "GeographicalLocation", null, null, null, null, null, null, - null, null, null, "language"); - private String languageCode; + "LOC", null, null, null, "DATE", "PER", null, + null, null, "ORG"); public MasakhaNERDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); } - - /** - * Sets the language code for the dataset. 
- * - * @param language the language code (ISO 639-2) - */ - public void setLanguage(String language) { - languageCode = language.toLowerCase(); - - switch (languageCode) { - case "amh": - TYPE_TAGS.addTypeURI("amharic_type", "http://dbpedia.org/resource/Amharic_language"); - break; - case "hau": - TYPE_TAGS.addTypeURI("hausa_type", "http://dbpedia.org/resource/Hausa_language"); - break; - case "ibo": - TYPE_TAGS.addTypeURI("igbo_type", "http://dbpedia.org/resource/Igbo_language"); - break; - case "kin": - TYPE_TAGS.addTypeURI("kinyarwanda_type", "http://dbpedia.org/resource/Kinyarwanda_language"); - break; - case "lug": - TYPE_TAGS.addTypeURI("luganda_type", "http://dbpedia.org/resource/Luganda_language"); - break; - case "lu": - TYPE_TAGS.addTypeURI("luo_type", "http://dbpedia.org/resource/Luo_language"); - break; - case "pcm": - TYPE_TAGS.addTypeURI("pidgin_type", "http://dbpedia.org/resource/Nigerian_Pidgin"); - break; - case "swa": - TYPE_TAGS.addTypeURI("swahili_type", "http://dbpedia.org/resource/Swahili_language"); - break; - case "wol": - TYPE_TAGS.addTypeURI("wolof_type", "http://dbpedia.org/resource/Wolof_language"); - break; - case "yor": - TYPE_TAGS.addTypeURI("yoruba_type", "http://dbpedia.org/resource/Yoruba_language"); - break; - case "bam": - TYPE_TAGS.addTypeURI("bambara_type", "http://dbpedia.org/resource/Bambara_language"); - break; - case "ewe": - TYPE_TAGS.addTypeURI("ewe_type", "http://dbpedia.org/resource/Ewe_language"); - break; - case "fon": - TYPE_TAGS.addTypeURI("fon_type", "http://dbpedia.org/resource/Fon_language"); - break; - case "mos": - TYPE_TAGS.addTypeURI("mossi_type", "http://dbpedia.org/resource/Mossi_language"); - break; - case "bbj": - TYPE_TAGS.addTypeURI("ghomala_type", "http://dbpedia.org/resource/Ghomala_language"); - break; - case "nya": - TYPE_TAGS.addTypeURI("chichewa_type", "http://dbpedia.org/resource/Chichewa_language"); - break; - case "tsn": - TYPE_TAGS.addTypeURI("setswana_type", 
"http://dbpedia.org/resource/Setswana_language"); - break; - case "twi": - TYPE_TAGS.addTypeURI("twi_type", "http://dbpedia.org/resource/Twi_language"); - break; - case "sna": - TYPE_TAGS.addTypeURI("chishona_type", "http://dbpedia.org/resource/Chishona_language"); - break; - case "xho": - TYPE_TAGS.addTypeURI("isixhosa_type", "http://dbpedia.org/resource/IsiXhosa_language"); - break; - case "zul": - TYPE_TAGS.addTypeURI("isizulu_type", "http://dbpedia.org/resource/IsiZulu_language"); - break; - } - } } diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java index c3ac0a57c..2595960a5 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/ritter/RitterDataset.java @@ -24,7 +24,7 @@ public class RitterDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("geo-loc", null, "movie", "musicartist", - "other", "person", "product", "sportsteam", "tvshow", null, null); + "other", "person", "product", "sportsteam", "tvshow", null); public RitterDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java index 136a26f40..15b740551 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDataset.java @@ -24,7 +24,7 @@ public class UMBCDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, - null, "PER", null, null, null, "ORG", null); + null, "PER", 
null, null, null, "ORG"); public UMBCDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java index f1b5a7809..818c69113 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/wnut/WNUTDataset.java @@ -27,7 +27,7 @@ public class WNUTDataset extends GenericCoNLLDataset { private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("location", "corporation", null, null, - null, "person", "product", null, null, null, null); + null, "person", "product", null, null, null); public WNUTDataset(String file) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); From 186623dd736f040b1e9792d0294f1a737a5a22af Mon Sep 17 00:00:00 2001 From: neha2022 Date: Tue, 18 Jul 2023 22:52:00 +0200 Subject: [PATCH 11/25] updated starting and length on the test file --- .../impl/masakha/MasakhaNERDatasetTest.java | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 9e914c9c0..d36a29125 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -29,110 +29,110 @@ public InitializableDataset createDataset(File file) { public static Collection data() { List testConfigs = new ArrayList<>(); // Amharic language - testConfigs.add(new Object[] { - "እንደ O\n ኤለርስ B-LOC\n አብዛኞቹ O\n የአፍሪቃ B-LOC\n ሀገሮች O\n በአብዛኛዉ O\n የሥራ O\n ቦታ O\n ያለዉ O\n እርሻ O\n ላይ O\n ነዉ O\n ። O", - "እንደ O ኤለርስ B-LOC አብዛኞቹ O የአፍሪቃ B-LOC ሀገሮች O በአብዛኛዉ O የሥራ O ቦታ O ያለዉ O እርሻ O ላይ O ነዉ 
O ። O", - new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Amharic_language"), 0, 0 }); + testConfigs.add(new Object[] { + "ቀዳሚው O የሶማሌ B-LOC ክልል I-LOC በአወዳይ I-LOC ከተማ I-LOC ለተገደሉ O የክልሉ O ተወላጆች O ያከናወነው O የቀብር O ስነ O ስርዓትን O የተመለከተ O ዘገባ O ነው O ፡፡ O", + "ቀዳሚው የሶማሌ ክልል በአወዳይ ከተማ ለተገደሉ የክልሉ ተወላጆች ያከናወነው የቀብር ስነ ስርዓትን የተመለከተ ዘገባ ነው ፡፡ ", + new TypedSpanImpl(5, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Hausa language - testConfigs.add(new Object[] { - "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", - "Ga O dai O cikakken O hirar O : O", - new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Igbo_language"), 1, 0 }); + testConfigs.add(new Object[] { + "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", + "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", + new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 1, 0 }); // Igbo language testConfigs.add(new Object[] { "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", - "Igbokwe B-PER sịrị O n'aka O ndị O ndu O APC B-ORG adịghị O n'ihe O mere O n'Oshodi B-LOC - O Joe B-PER Igbokwe I-PER", - new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 2, 0 }); + "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", + new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 2, 0 }); // Kinyarwanda language testConfigs.add(new Object[] { "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . 
O", - "Amabwiriza O yo O kubungabunga O ubuzima O hirindwa O COVID O - O 19 O", - new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Kinyarwanda_language"), 3, 0 }); + "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", + new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 3, 0 }); // Luganda language testConfigs.add(new Object[] { "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", - "Ekivvulu O kino O Kiwagiddwa O Vision B-ORG Group I-ORG efulumya O ne O Bukedde B-ORG . O", - new TypedSpanImpl(4, 2, "http://dbpedia.org/resource/Luganda_language"), 4, 0 }); + "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", + new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 4, 0 }); // Luo language testConfigs.add(new Object[] { - "Kwan O\n jii O\n maromo O\n 796 O\n mane O\n oyudi O\n ni O\n nigi O\n Covid O\n - O\n 19 O\n ei O\n kawuononi B-DATE", - "Jii O adek O mawuok O e O familia O achiel O polo O onego O", - new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Luo_language"), 5, 0 }); + "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", + "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", + new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 5, 0 }); // Nigerian Pidgin language testConfigs.add(new Object[] { "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", - "Dat O na O how O we O take O start O wit O label O . 
O", - new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Nigerian_Pidgin"), 6, 0 }); + "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", + new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 6, 0 }); // Swahili language testConfigs.add(new Object[] { "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . O\n", - "Watu O watatu O wengine O waliokuwa O ndani O ya O basi O wameripotiwa O kujeruhiwa O katika O shambulizi O hilo O . O", - new TypedSpanImpl(3, 2, "http://dbpedia.org/resource/Swahili_language"), 7, 0 }); + "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", + new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 7, 0 }); // Wolof language testConfigs.add(new Object[] { "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", - "Waxatuñu O dara O , O nga O lebi O xaalis O bu O dul O jeex O ci O turu O askan O wi O . O", - new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Wolof_language"), 8, 0 }); + "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", + new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 8, 0 }); // Yoruba language testConfigs.add(new Object[] { "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . 
O ", - "Àwòrán O àgékù O láti O ibùdó O Channel B-ORG Television I-ORG You I-ORG Tube I-ORG . O", - new TypedSpanImpl(2, 2, "http://dbpedia.org/resource/Yoruba_language"), 9, 0 }); + "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", + new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 9, 0 }); // Bambara language testConfigs.add(new Object[] { - "Nin O waati O in O na O , O a O ka O gɛlɛn O mɔgɔ O k'i O dantigɛ O a O fatuli O kun O jɔnjɔnw O kan O , O k'a O da O a O kan O a O sababuw O tolen O bɛ O dibi O bɛ O . O", - "Sɔrɔdasiw O ye O polisiw O labɔ O kulusigi O jɔyɔrɔ O fɔlɔ O . O", - new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Bambara_language"), 12, 0 }); + "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . O", + "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", + new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 10, 0 }); // Ghomala language testConfigs.add(new Object[] { - "Msaʼnyə̂ O gɔtí O cyətə O nə́ O bǎyá O cyə́ O nəjí O pôʼ O bǎhə́lə́ O", "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", - new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ghomala_language"), 16, 0 }); + "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", + new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 11, 0 }); // Ewe language testConfigs.add(new Object[] { "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", - "Le O Togo B-LOC : O wotso O afia O na O ame O 200 O ɖe O akpa O sesẽ O nu O wɔwɔ O ame O ŋu O . 
O", - new TypedSpanImpl(1, 1, "http://dbpedia.org/resource/Ewe_language"), 13, 0 }); + "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", + new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 12, 0 }); // Fon language testConfigs.add(new Object[] { - "È O ká O wɔn O dotóo O lɛ́ɛ O ɖesu O ǎ O . O", - "Sɔmì O sɔmì O sɛ́n O ɔ́ O lɔ́ɔ O dó O zogbeji O . O", - new TypedSpanImpl(0, 1, "http://dbpedia.org/resource/Fon_language"), 14, 0 }); + "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", + "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", + new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 13, 0 }); // Mossi language testConfigs.add(new Object[] { - "Yao O sãan O wa O mikame O tɩ O lamd O n O dɩk O bugm O . O", - "Sẽn O geta O tẽnga O yell O rɛɛgdã O a O Tamotsu B-PER Ikezaki I-PER menga O zĩinda O tʋʋdã O tɩʋʋsgo O . O", - new TypedSpanImpl(1, 2, "http://dbpedia.org/resource/Mossi_language"), 10, 0 }); + "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", + "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", + new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 14, 0 }); // Chichewa language testConfigs.add(new Object[] { "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", - "Dziko O lino O lili O ndi O zipani O zoposera O 50 O . O", - new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/Chichewa_language"), 17, 0 }); + "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", + new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 15, 0 }); // Setswana language testConfigs.add(new Object[] { - "E O ne O e O le O motlotli O wa O dikgang O yo O gaisang O . 
O", - "Mo O bidiong O e O e O fa O tlase O , O O My O Octopus O Teacher O e O ne O newa O sekgele O sa O Oscar B-ORG . O", - new TypedSpanImpl(2, 1, "http://dbpedia.org/resource/Setswana_language"), 18, 0 }); + "Zuma B-PER o O ipolela O a O se O molato O. O", + "Zuma o ipolela a se molato.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 16, 0 }); // Twi (Akan/Twi) language testConfigs.add(new Object[] { - "Sɛ O yɛwɔ O tema O ma O obi O a O , O yebehu O sɛ O ɛnsono O yɛn O ɛnna O ɛsono O ɔno O . O", - "Ɛmfa O ho O nea O wobɛyɛ O biara O no O , O ɛsɛ O sɛ O wudi O nhyehyɛe O pa O no O akyi O . O", - new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/Akan_language"), 15, 0 }); + "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", + "Paul resusu sika dodow a ohia na ɔde awie fie no.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 17, 0 }); // chiShona language testConfigs.add(new Object[] { - "Huwandu O uhu O hunotarisirwa O kukwira O . O", - "Patafonera O chipatara O ichi O chati O titumire O mibvunzo O asi O tange O tisati O tawana O mhinduro O pataenda O kumhepo O . O", - new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/chiShona_language"), 19, 0 }); + "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . O", + "Messi ndiye akarova penalty yekutanga akatadza.", + new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 18, 0 }); // isiXhosa language testConfigs.add(new Object[] { - "Konakala O izinto O emsebenzini O emva O kokoyisakala O bubuthonga O ngenxa O yobude O bendlela O . O", - "Ọwọ́ O líle O làwọn O ẹ̀gbọ́n O mi O tí O mo O gbé O ọ̀dọ̀ O wọn O fi O mú O mi O . O", - new TypedSpanImpl(3, 1, "http://dbpedia.org/resource/isiXhosa_language"), 20, 0 }); + "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. 
O", + "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", + new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 19, 0 }); // isiZulu language testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", - "Eqhuba O , O uqinisekisile O emalungwini O omphakathi O ukuthi O umasipala O uzozibophezela O ekuphuthumiseni O izidingo O zawo O . O", - new TypedSpanImpl(4, 1, "http://dbpedia.org/resource/isiZulu_language"), 21, 0 }); + "IMeya yeTheku ingenelela enkingeni yombhikisho", + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 20, 0 }); return testConfigs; } From 1cc00ab81efb79f4b96be22d0826be59ea2f1281 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 19 Jul 2023 09:59:54 +0200 Subject: [PATCH 12/25] updated correct experiment type for masakha dataset --- src/main/properties/datasets.properties | 126 +++++++++--------- .../impl/masakha/MasakhaNERDatasetTest.java | 6 +- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 6dce297d8..7e51e55e6 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -322,338 +322,338 @@ org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.name=MasakhaNER-Amharic- org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.name=MasakhaNER-Amharic-Test 
org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/test.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.name=MasakhaNER-Amharic-Train org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/train.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=D2KB ### Hausa org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.name=MasakhaNER-Hausa-Dev org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.name=MasakhaNER-Hausa-Test org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/test.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.name=MasakhaNER-Hausa-Train org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/train.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=D2KB ### Igbo org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.name=MasakhaNER-Igbo-Dev org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.name=MasakhaNER-Igbo-Test org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.name=MasakhaNER-Igbo-Train org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/train.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=D2KB ### Kinyarwanda org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.name=MasakhaNER-Kinyarwanda-Dev org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.name=MasakhaNER-Kinyarwanda-Test org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/test.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=D2KB 
org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.name=MasakhaNER-Kinyarwanda-Train org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/train.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=D2KB ### Luganda org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.name=MasakhaNER-Luganda-Dev org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.name=MasakhaNER-Luganda-Test org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.name=MasakhaNER-Luganda-Train 
org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/train.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=D2KB ### Luo org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.name=MasakhaNER-Luo-Dev org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.name=MasakhaNER-Luo-Test org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.name=MasakhaNER-Luo-Train org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/train.txt 
org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=D2KB ### Naija org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.name=MasakhaNER-Naija-Dev org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.name=MasakhaNER-Naija-Test org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/test.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.name=MasakhaNER-Naija-Train org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/train.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=D2KB ### Swahili 
org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.name=MasakhaNER-Swahili-Dev org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.name=MasakhaNER-Swahili-Test org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/test.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.name=MasakhaNER-Swahili-Train org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/train.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=D2KB ### Wolof org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.name=MasakhaNER-Wolof-Dev org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.name=MasakhaNER-Wolof-Test org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/test.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.name=MasakhaNER-Wolof-Train org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/train.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=D2KB ### Yoruba org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.name=MasakhaNER-Yoruba-Dev org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.name=MasakhaNER-Yoruba-Test org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/test.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER-Yoruba-Train org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/train.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=D2KB ### Bambara language org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.name=MasakhaNER-Bambara-Dev org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=D2KB 
org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.name=MasakhaNER-Bambara-Test org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/test.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.name=MasakhaNER-Bambara-Train org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/train.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=D2KB ### Ewe language org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.name=MasakhaNER-Ewe-Dev org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.name=MasakhaNER-Ewe-Test org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/test.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.name=MasakhaNER-Ewe-Train org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/train.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=D2KB ### Fon language org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.name=MasakhaNER-Fon-Dev org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.name=MasakhaNER-Fon-Test org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/test.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=RT2KB 
+org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.name=MasakhaNER-Fon-Train org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/train.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=D2KB ### Mossi language org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.name=MasakhaNER-Mossi-Dev org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.name=MasakhaNER-Mossi-Test org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/test.txt org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.name=MasakhaNER-Mossi-Train org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/train.txt org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=D2KB ### Ghomala language org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.name=MasakhaNER-Ghomala-Dev org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.name=MasakhaNER-Ghomala-Test org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/test.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.name=MasakhaNER-Ghomala-Train org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/train.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=D2KB ### Chichewa language org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.name=MasakhaNER-Chichewa-Dev org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.name=MasakhaNER-Chichewa-Test org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/test.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.name=MasakhaNER-Chichewa-Train org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/train.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=D2KB ### Setswana language 
org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.name=MasakhaNER-Setswana-Dev org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.name=MasakhaNER-Setswana-Test org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/test.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.name=MasakhaNER-Setswana-Train org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/train.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=D2KB ### (Akan/Twi) language org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.name=MasakhaNER-Twi-Dev org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.name=MasakhaNER-Twi-Test org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/test.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.name=MasakhaNER-Twi-Train org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/train.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=D2KB ### chiShona language org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.name=MasakhaNER-chiShona-Dev org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/dev.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.name=MasakhaNER-chiShona-Test org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/test.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.name=MasakhaNER-chiShona-Train org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/train.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=D2KB ### isiXhosa language org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.name=MasakhaNER-isiXhosa-Dev org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/dev.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=D2KB 
org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.name=MasakhaNER-isiXhosa-Test org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/test.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.name=MasakhaNER-isiXhosa-Train org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/train.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=D2KB ### isiZulu language org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.name=MasakhaNER-isiZulu-Dev org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/dev.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.name=MasakhaNER-isiZulu-Test org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/test.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=D2KB org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.name=MasakhaNER-isiZulu-Train org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/train.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=D2KB ### Meij #org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index d36a29125..c6b18ba19 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -30,9 +30,9 @@ public static Collection data() { List testConfigs = new ArrayList<>(); // Amharic language testConfigs.add(new Object[] { - "ቀዳሚው O የሶማሌ B-LOC ክልል I-LOC በአወዳይ I-LOC ከተማ I-LOC ለተገደሉ O የክልሉ O ተወላጆች O ያከናወነው O የቀብር O ስነ O ስርዓትን O የተመለከተ O ዘገባ O ነው O ፡፡ O", - "ቀዳሚው የሶማሌ ክልል በአወዳይ ከተማ ለተገደሉ የክልሉ ተወላጆች ያከናወነው የቀብር ስነ ስርዓትን የተመለከተ ዘገባ ነው ፡፡ ", - new TypedSpanImpl(5, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); + "የጀርመን B-LOC የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O 
አድርገናል O ፡፡ O", + "የጀርመን የምርጫ ዘመቻን አስመልክቶ ከባልደረባችን ማንተጋፍቶት ስለሺ ጋር ቃለ ምልልስ አድርገናል ፡፡ ", + new TypedSpanImpl(32, 11, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Hausa language testConfigs.add(new Object[] { "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", From 06df8d4f7ae297766419b7007dc5383e606f87bb Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 19 Jul 2023 11:17:11 +0200 Subject: [PATCH 13/25] document Id updated --- .../impl/masakha/MasakhaNERDatasetTest.java | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index c6b18ba19..df9877f09 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -31,108 +31,108 @@ public static Collection data() { // Amharic language testConfigs.add(new Object[] { "የጀርመን B-LOC የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", - "የጀርመን የምርጫ ዘመቻን አስመልክቶ ከባልደረባችን ማንተጋፍቶት ስለሺ ጋር ቃለ ምልልስ አድርገናል ፡፡ ", - new TypedSpanImpl(32, 11, "http://dbpedia.org/ontology/Person"), 0, 0 }); + "የጀርመን የምርጫ ዘመቻን አስመልክቶ ከባልደረባችን ማንተጋፍቶት ስለሺ ጋር ቃለ ምልልስ አድርገናል ፡፡ ", + new TypedSpanImpl(32, 11, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Hausa language testConfigs.add(new Object[] { "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", - new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 1, 0 }); + new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Igbo language testConfigs.add(new Object[] { "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n 
banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", - new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 2, 0 }); + new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Kinyarwanda language testConfigs.add(new Object[] { "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", - new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 3, 0 }); + new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0 }); // Luganda language testConfigs.add(new Object[] { "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . 
O", "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", - new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 4, 0 }); + new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Luo language testConfigs.add(new Object[] { "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", - new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 5, 0 }); + new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0 }); // Nigerian Pidgin language testConfigs.add(new Object[] { "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", - new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 6, 0 }); + new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0 }); // Swahili language testConfigs.add(new Object[] { "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . 
O\n", "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", - new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 7, 0 }); + new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Wolof language testConfigs.add(new Object[] { "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", - new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 8, 0 }); + new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Yoruba language testConfigs.add(new Object[] { "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", - new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 9, 0 }); + new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Bambara language testConfigs.add(new Object[] { "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . 
O", "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", - new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 10, 0 }); + new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Ghomala language testConfigs.add(new Object[] { "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", - new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 11, 0 }); + new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Ewe language testConfigs.add(new Object[] { "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", - new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 12, 0 }); + new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Fon language testConfigs.add(new Object[] { "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", - new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 13, 0 }); + new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Mossi language testConfigs.add(new Object[] { "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", - new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 14, 0 }); + new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Chichewa language testConfigs.add(new Object[] { "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . 
O", "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", - new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 15, 0 }); + new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0 }); // Setswana language testConfigs.add(new Object[] { "Zuma B-PER o O ipolela O a O se O molato O. O", "Zuma o ipolela a se molato.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 16, 0 }); + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); // Twi (Akan/Twi) language testConfigs.add(new Object[] { "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", "Paul resusu sika dodow a ohia na ɔde awie fie no.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 17, 0 }); + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); // chiShona language testConfigs.add(new Object[] { "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . O", "Messi ndiye akarova penalty yekutanga akatadza.", - new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 18, 0 }); + new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0 }); // isiXhosa language testConfigs.add(new Object[] { "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. 
O", "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", - new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 19, 0 }); + new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0 }); // isiZulu language testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", "IMeya yeTheku ingenelela enkingeni yombhikisho", - new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 20, 0 }); + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0 }); return testConfigs; } From 9dfae52223112071b0e4c1e307e8ddc877af72e9 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 19 Jul 2023 22:14:11 +0200 Subject: [PATCH 14/25] updated correct experiment type --- src/main/properties/datasets.properties | 126 ++++++++++++------------ 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 7e51e55e6..76f7a3b47 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -322,338 +322,338 @@ org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.name=MasakhaNER-Amharic- org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.name=MasakhaNER-Amharic-Test org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/test.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.name=MasakhaNER-Amharic-Train org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/train.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=RT2KB ### Hausa org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.name=MasakhaNER-Hausa-Dev org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.name=MasakhaNER-Hausa-Test org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/test.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.name=MasakhaNER-Hausa-Train org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/train.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=RT2KB ### Igbo org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.name=MasakhaNER-Igbo-Dev org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.name=MasakhaNER-Igbo-Test org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.name=MasakhaNER-Igbo-Train 
org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/train.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=RT2KB ### Kinyarwanda org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.name=MasakhaNER-Kinyarwanda-Dev org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.name=MasakhaNER-Kinyarwanda-Test org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/test.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.name=MasakhaNER-Kinyarwanda-Train org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/train.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=RT2KB ### Luganda org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.name=MasakhaNER-Luganda-Dev org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.name=MasakhaNER-Luganda-Test org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.name=MasakhaNER-Luganda-Train org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/train.txt 
org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=RT2KB ### Luo org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.name=MasakhaNER-Luo-Dev org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.name=MasakhaNER-Luo-Test org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.name=MasakhaNER-Luo-Train org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/train.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=RT2KB ### Naija 
org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.name=MasakhaNER-Naija-Dev org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.name=MasakhaNER-Naija-Test org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/test.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.name=MasakhaNER-Naija-Train org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/train.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=RT2KB ### Swahili org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.name=MasakhaNER-Swahili-Dev org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.name=MasakhaNER-Swahili-Test org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/test.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.name=MasakhaNER-Swahili-Train org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/train.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=RT2KB ### Wolof org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.name=MasakhaNER-Wolof-Dev org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.name=MasakhaNER-Wolof-Test org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/test.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.name=MasakhaNER-Wolof-Train org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/train.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=RT2KB ### Yoruba org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.name=MasakhaNER-Yoruba-Dev org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.name=MasakhaNER-Yoruba-Test 
org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/test.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER-Yoruba-Train org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/train.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB ### Bambara language org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.name=MasakhaNER-Bambara-Dev org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.name=MasakhaNER-Bambara-Test org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/test.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.name=MasakhaNER-Bambara-Train org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/train.txt org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=RT2KB ### Ewe language org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.name=MasakhaNER-Ewe-Dev org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.name=MasakhaNER-Ewe-Test org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/test.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.name=MasakhaNER-Ewe-Train org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/train.txt org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=RT2KB ### Fon language org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.name=MasakhaNER-Fon-Dev org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.name=MasakhaNER-Fon-Test org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/test.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.name=MasakhaNER-Fon-Train 
org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/train.txt org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=RT2KB ### Mossi language org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.name=MasakhaNER-Mossi-Dev org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.name=MasakhaNER-Mossi-Test org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/test.txt org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.name=MasakhaNER-Mossi-Train org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/train.txt 
org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=RT2KB ### Ghomala language org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.name=MasakhaNER-Ghomala-Dev org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.name=MasakhaNER-Ghomala-Test org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/test.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.name=MasakhaNER-Ghomala-Train org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/train.txt org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=D2KB 
+org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=RT2KB ### Chichewa language org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.name=MasakhaNER-Chichewa-Dev org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.name=MasakhaNER-Chichewa-Test org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/test.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.name=MasakhaNER-Chichewa-Train org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/train.txt org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=RT2KB ### Setswana language org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.name=MasakhaNER-Setswana-Dev 
org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.name=MasakhaNER-Setswana-Test org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/test.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.name=MasakhaNER-Setswana-Train org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/train.txt org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=RT2KB ### (Akan/Twi) language org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.name=MasakhaNER-Twi-Dev org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.name=MasakhaNER-Twi-Test org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/test.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.name=MasakhaNER-Twi-Train org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/train.txt org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=RT2KB ### chiShona language org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.name=MasakhaNER-chiShona-Dev org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/dev.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.name=MasakhaNER-chiShona-Test org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/test.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.name=MasakhaNER-chiShona-Train org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/train.txt org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=RT2KB ### isiXhosa language org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.name=MasakhaNER-isiXhosa-Dev org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/dev.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=RT2KB 
org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.name=MasakhaNER-isiXhosa-Test org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/test.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.name=MasakhaNER-isiXhosa-Train org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/train.txt org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=RT2KB ### isiZulu language org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.name=MasakhaNER-isiZulu-Dev org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/dev.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.name=MasakhaNER-isiZulu-Test org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/test.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=RT2KB org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.name=MasakhaNER-isiZulu-Train org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/train.txt org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=D2KB +org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=RT2KB ### Meij #org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list From 7bca7957241a5efb4736389c669ca770bcfa68b4 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Mon, 24 Jul 2023 22:47:14 +0200 Subject: [PATCH 15/25] resolved comments from git --- .../impl/conll/GenericCoNLLDataset.java | 507 +++++++++--------- .../impl/masakha/MasakhaNERDataset.java | 12 +- .../impl/masakha/MasakhaNERDatasetTest.java | 233 ++++---- 3 files changed, 372 insertions(+), 380 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index a618d84e3..7de5a7a7f 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -47,275 +47,272 @@ */ public class GenericCoNLLDataset extends AbstractDataset implements InitializableDataset { - private static 
final Logger LOGGER = LoggerFactory.getLogger(GenericCoNLLDataset.class); + private static final Logger LOGGER = LoggerFactory.getLogger(GenericCoNLLDataset.class); - /** - * Prefix of a value in the marking column that expresses the start of a - * marking. TODO think about removing the '-' or make it configurable. - */ - protected static final String MARKING_START = "B-"; + /** + * Prefix of a value in the marking column that expresses the start of a + * marking. TODO think about removing the '-' or make it configurable. + */ + protected static final String MARKING_START = "B-"; - /** - * Prefix of a value in the marking column that expresses the continuation of a - * marking. - */ - protected static final String MARKING_INSIDE = "I-"; + /** + * Prefix of a value in the marking column that expresses the continuation of a + * marking. + */ + protected static final String MARKING_INSIDE = "I-"; - /** - * The file from which the data will be loaded. - */ - protected String file; - /** - * The list of documents loaded from the file. - */ - protected List documents; - /** - * Id of the first document. - */ - protected int firstDocId; - /** - * Id of the last document. - */ - protected int lastDocId; - /** - * Class to map markings from the dataset to their type IRI. - */ - protected CoNLLTypeRetriever typeRetriever; - /** - * Id of the column that contains the annotations. - */ - protected int annotationColumn; - /** - * Id of the column that contains the entity's IRI. If there is no such column, - * it is set to -1. - */ - protected int uriColumn; - - + /** + * The file from which the data will be loaded. + */ + protected String file; + /** + * The list of documents loaded from the file. + */ + protected List documents; + /** + * Id of the first document. + */ + protected int firstDocId; + /** + * Id of the last document. + */ + protected int lastDocId; + /** + * Class to map markings from the dataset to their type IRI. 
+ */ + protected CoNLLTypeRetriever typeRetriever; + /** + * Id of the column that contains the annotations. + */ + protected int annotationColumn; + /** + * Id of the column that contains the entity's IRI. If there is no such column, + * it is set to -1. + */ + protected int uriColumn; - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { - this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { + this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); + } - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, - String firstDocId, String lastDocId) { - this(file, annotationColumn, uriColumn, typeRetriever, Integer.parseInt(firstDocId), - Integer.parseInt(lastDocId)); - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, + String firstDocId, String lastDocId) { + this(file, annotationColumn, uriColumn, typeRetriever, Integer.parseInt(firstDocId), + Integer.parseInt(lastDocId)); + } - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, - int firstDocId, int lastDocId) { - this.file = file; - this.annotationColumn = annotationColumn; - this.uriColumn = uriColumn; - this.typeRetriever = typeRetriever; - this.firstDocId = firstDocId; - this.lastDocId = lastDocId; - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, + int firstDocId, int lastDocId) { + this.file = file; + this.annotationColumn = annotationColumn; + this.uriColumn = uriColumn; + this.typeRetriever = typeRetriever; + this.firstDocId = firstDocId; + this.lastDocId = lastDocId; + } - @Override - public int size() { - return documents.size(); - } + @Override + public int 
size() { + return documents.size(); + } - @Override - public List getInstances() { - return documents; - } + @Override + public List getInstances() { + return documents; + } - @Override - public void init() throws GerbilException { - this.documents = loadDocuments(new File(file)); - if ((firstDocId > 0) && (lastDocId > 0)) { - this.documents = this.documents.subList(firstDocId - 1, lastDocId); - } - } + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(file)); + if ((firstDocId > 0) && (lastDocId > 0)) { + this.documents = this.documents.subList(firstDocId - 1, lastDocId); + } + } - /** - * This method loads the CoNLL dataset from the given file. - * - * @param file file from which the dataset will be loaded - * @return list of {@link Document} instances that have been loaded. - * @throws GerbilException if there is an IO error while reading the file. - */ - protected List loadDocuments(File file) throws GerbilException { - List documents = new ArrayList(); - // Create namespace for the documents of this dataset - String documentUriPrefix = "http://" + getName() + "/"; - StringBuilder textOfCurrentDocument = new StringBuilder(); - // Flag to track if a whitespace should be inserted in front of a line - boolean whiteSpaceInFront = true; - // Flag to track if a whitespace should be inserted behind a line - boolean whiteSpaceBehind = true; - try (BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));) { - String line = reader.readLine(); - // Id of the next document in this file - int index = 0; - List linesOfCurrentDoc = new ArrayList<>(); - // Iterate through all lines until the complete file has been read. 
- while (line != null) { - // If there is an empty line, the previous document ended and will be added to - // the list of documents - if (line.trim().isEmpty()) { - // If there is a document that can be added - if (linesOfCurrentDoc.size() > 0) { - // Get Markings - List markings = findMarkings(linesOfCurrentDoc, textOfCurrentDocument); - // Save the document - documents.add(new DocumentImpl(textOfCurrentDocument.toString(), documentUriPrefix + index, - markings)); - // Reset local variables - textOfCurrentDocument.delete(0, textOfCurrentDocument.length()); - linesOfCurrentDoc.clear(); - // Increase the document ID - index++; - } - } else { - if ((textOfCurrentDocument.length() > 0) && (line.length() >= 1)) { - if (line.length() == 1) { // Check if the line has only one character - char ch = line.charAt(0); // Get the character - switch (ch) { - case '?': - case '!': - case ',': - case ')': - case ']': - case '}': - case '.': - whiteSpaceInFront = false; - break; - case '፠': // ፠ section mark - case '፡': // ፡ word separator - case '።': // ። full stop (period) - case '፣': // ፣ comma - case '፤': // ፤ semicolon - case '፥': // ፥ colon - case '፦': // ፦ preface colon - case '፧': // ፧ question mark - case '፨': // ፨ paragraph separator - whiteSpaceInFront = true; - whiteSpaceBehind = true; - break; - case '"': - // Toggle whiteSpaceBehind if the character is a quote mark - whiteSpaceBehind = !whiteSpaceBehind; - break; - case '(': - case '[': - case '{': - // Set whiteSpaceBehind to false if the character is an opening parenthesis or bracket - whiteSpaceBehind = false; - break; - default: - break; - } + /** + * This method loads the CoNLL dataset from the given file. + * + * @param file file from which the dataset will be loaded + * @return list of {@link Document} instances that have been loaded. + * @throws GerbilException if there is an IO error while reading the file. 
+ */ + protected List loadDocuments(File file) throws GerbilException { + List documents = new ArrayList(); + // Create namespace for the documents of this dataset + String documentUriPrefix = "http://" + getName() + "/"; + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));) { + String line = reader.readLine(); + // Id of the next document in this file + int index = 0; + List linesOfCurrentDoc = new ArrayList<>(); + // Iterate through all lines until the complete file has been read. + while (line != null) { + // If there is an empty line, the previous document ended and will be added to + // the list of documents + if (line.trim().isEmpty()) { + // If there is a document that can be added + if (linesOfCurrentDoc.size() > 0) { + // Get Markings + StringBuilder currentText = new StringBuilder(); + List markings = processSingleDocument(linesOfCurrentDoc, new StringBuilder()); + // Save the document + documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); + // Increase the document ID + index++; + } + // Clear the lines for the next document + linesOfCurrentDoc.clear(); + } else { + // Add the current line to the list of lines of the current document + linesOfCurrentDoc.add(line); + } + // Read the next line + line = reader.readLine(); + } + // check if there is a document left that should be added + if (linesOfCurrentDoc.size() > 0) { + // Get Markings + StringBuilder currentText = new StringBuilder(); + List markings = processSingleDocument(linesOfCurrentDoc, currentText); + // Save last document + documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR); + } + return documents; + } - } - else if (!Character.isLetterOrDigit(line.charAt(0))) { - // Check if the first character of the 
line is not a letter or digit - whiteSpaceInFront = false; - // Set whiteSpaceInFront to false if the line starts with a non-letter or non-digit character - } - if (whiteSpaceInFront) { - textOfCurrentDocument.append(' '); - // Append a whitespace to separate the current line from the previous content - } - } - // Add the current line to the list of lines of the current document - linesOfCurrentDoc.add(line); - // Append the current line to the text of the current document - textOfCurrentDocument.append(line); - } - // Read the next line - line = reader.readLine(); - } - // check if there is a document left that should be added - if (linesOfCurrentDoc.size() > 0) { - // Get Markings - List markings = findMarkings(linesOfCurrentDoc, textOfCurrentDocument); - // Save last document - documents.add(new DocumentImpl(textOfCurrentDocument.toString(), documentUriPrefix + index, markings)); - } - } catch (IOException e) { - throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR); - } - return documents; - } + /** + * Find markings of document and add document text to the given StringBuilder. 
+ * + * @param linesOfCurrentDoc the lines of the current document + * @param currentText StringBuilder to which the document text should be + * added + * @return The list of {@link Marking} instances found within the document + */ + protected List processSingleDocument(List linesOfCurrentDoc, StringBuilder currentText) { + List markings = new ArrayList(); + int i = 0; + // Flags to track if a whitespace should be inserted in front of and behind a + // line + boolean whiteSpaceInFront = true; + boolean whiteSpaceBehind = true; + // Iterate over the document lines + for (String tokenFull : linesOfCurrentDoc) { + // split the columns + String[] token = tokenFull.split("\t+"); + // If we can parse this line + if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { + // Get the marking from this line (and maybe the next lines) + markings.add(getWholeMarking(linesOfCurrentDoc, i, currentText)); + } + // Check if the line has only one character + if ((currentText.length() > 0) && (token[0].length() >= 1)) { + if (token[0].length() == 1) { + char ch = token[0].charAt(0); // Get the character + switch (ch) { + case '?': // falls through + case '!': + case ',': + case ')': + case ']': + case '}': + case '.': { + whiteSpaceInFront = false; + break; + } + case '"': { + // Toggle whiteSpaceBehind if the character is a quote mark + whiteSpaceBehind = !whiteSpaceBehind; + break; + } + case '(': // falls through + case '[': + case '{': { + whiteSpaceBehind = false; + break; + } + case '፠': // ፠ section mark + case '፡': // ፡ word separator + case '።': // ። full stop (period) + case '፣': // ፣ comma + case '፤': // ፤ semicolon + case '፥': // ፥ colon + case '፦': // ፦ preface colon + case '፧': // ፧ question mark + case '፨': // ፨ paragraph separator + whiteSpaceInFront = true; + whiteSpaceBehind = true; + break; + default: { + break; + } + } - /** - * Find markings of document and add document text to the given StringBuilder. 
- * - * @param linesOfCurrentDoc the lines of the current document - * @param currentText StringBuilder to which the document text should be - * added - * @return The list of {@link Marking} instances found within the document - */ - protected List findMarkings(List linesOfCurrentDoc, StringBuilder currentText) { - List markings = new ArrayList(); - int i = 0; - // Iterate over the document lines - for (String tokenFull : linesOfCurrentDoc) { - // split the columns - String[] token = tokenFull.split("\t+"); - // If we can parse this line - if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { - // Get the marking from this line (and maybe the next lines) - markings.add(getWholeMarking(linesOfCurrentDoc, i, currentText)); - } - // Add the token from this line to the document's text - // TODO 1. make the whitespace configurable to allow other word separators 2. - // Remove the previous word separator if we have a punctuation character. - // (quotation, apostrophe) - currentText.append(token[0] + " "); - // Increase the line ID - i++; - } - return markings; - } + } else if (!Character.isLetterOrDigit(token[0].charAt(0))) { + // Check if the first character of the line is not a letter or digit + whiteSpaceInFront = false; + // Set whiteSpaceInFront to false if the line starts with a non-letter or + // non-digit character + } + } + // Remove leading/trailing whitespaces and normalize spaces within the token + String normalizedToken = token[0].trim().replaceAll("\\s+", " "); + // Add the token from this line to the document's text + // TODO 1. make the whitespace configurable to allow other word separators 2. + // Remove the previous word separator if we have a punctuation character. 
+ // (quotation, apostrophe) + currentText.append(normalizedToken + " "); + // Increase the line ID + i++; + } + return markings; + } - protected Marking getWholeMarking(List linesOfCurrentDoc, int pos, StringBuilder currentText) { - String[] tokens = linesOfCurrentDoc.get(pos).split("\t"); + protected Marking getWholeMarking(List linesOfCurrentDoc, int pos, StringBuilder currentText) { + String[] tokens = linesOfCurrentDoc.get(pos).split("\t"); - // get type of the marking TODO if the B- and I- are configurable, the - // substring(2) has to be configurable as well. - String type = typeRetriever.getTypeURI(tokens[annotationColumn].substring(2)); + // get type of the marking TODO if the B- and I- are configurable, the + // substring(2) has to be configurable as well. + String type = typeRetriever.getTypeURI(tokens[annotationColumn].substring(2)); - // get uri of the marking if given in the dataset - String uri = null; - if (uriColumn != -1 && tokens[uriColumn].startsWith("http")) { - uri = tokens[uriColumn]; - } + // get uri of the marking if given in the dataset + String uri = null; + if (uriColumn != -1 && tokens[uriColumn].startsWith("http")) { + uri = tokens[uriColumn]; + } - // get surface form of the marking - StringBuilder surfaceForm = new StringBuilder().append(tokens[0]); - for (int i = pos + 1; i < linesOfCurrentDoc.size(); i++) { - tokens = linesOfCurrentDoc.get(i).split("\t"); - if (tokens[annotationColumn].startsWith(MARKING_INSIDE)) { - // TODO 1. make the whitespace configurable to allow other word separators 2. - // Remove the previous word separator if we have a punctuation character. 
- surfaceForm.append(" ").append(tokens[0]); - } else { - break; - } - } - if (type != null) { - if (uri != null) { - return new TypedNamedEntity(currentText.length(), surfaceForm.length(), uri, - new HashSet(Arrays.asList(type))); - } else { - return new TypedSpanImpl(currentText.length(), surfaceForm.length(), - new HashSet(Arrays.asList(type))); - } - } else { - if (uri != null) { - return new NamedEntity(currentText.length(), surfaceForm.length(), uri); - } else { - LOGGER.warn( - "Found a marked piece of text without any further information: \"{}\". This is either an error in the dataset or this adapter is not correctly configured.", - surfaceForm); - return new SpanImpl(currentText.length(), surfaceForm.length()); - } - } - } + // get surface form of the marking + StringBuilder surfaceForm = new StringBuilder().append(tokens[0]); + for (int i = pos + 1; i < linesOfCurrentDoc.size(); i++) { + tokens = linesOfCurrentDoc.get(i).split("\t"); + if (tokens[annotationColumn].startsWith(MARKING_INSIDE)) { + // TODO 1. make the whitespace configurable to allow other word separators 2. + // Remove the previous word separator if we have a punctuation character. + surfaceForm.append(" ").append(tokens[0]); + } else { + break; + } + } + if (type != null) { + if (uri != null) { + return new TypedNamedEntity(currentText.length(), surfaceForm.length(), uri, + new HashSet(Arrays.asList(type))); + } else { + return new TypedSpanImpl(currentText.length(), surfaceForm.length(), + new HashSet(Arrays.asList(type))); + } + } else { + if (uri != null) { + return new NamedEntity(currentText.length(), surfaceForm.length(), uri); + } else { + LOGGER.warn( + "Found a marked piece of text without any further information: \"{}\". 
This is either an error in the dataset or this adapter is not correctly configured.", + surfaceForm); + return new SpanImpl(currentText.length(), surfaceForm.length()); + } + } + } } \ No newline at end of file diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index 74b2d9d0e..86ac142d4 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -4,14 +4,14 @@ import org.aksw.gerbil.dataset.impl.conll.CoNLLTypeRetriever; public class MasakhaNERDataset extends GenericCoNLLDataset { - private static final int ANNOTATION_COLUMN = 2; - private static final int URI_COLUMN = 1; - private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever( - "LOC", null, null, null, "DATE", "PER", null, - null, null, "ORG"); + + private static final int ANNOTATION_COLUMN = 1; + private static final int URI_COLUMN = -1; + private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, + "DATE", "PER", null, null, null, "ORG"); public MasakhaNERDataset(String file) { - super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); + super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); } } diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index df9877f09..63408c4e0 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -15,126 +15,121 @@ @RunWith(Parameterized.class) public class MasakhaNERDatasetTest extends AbstractGenericCoNLLDatasetTest { - public MasakhaNERDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId, - int markingId) { - super(fileContent, 
text, expectedMarking, documentId, markingId); - } + public MasakhaNERDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId, + int markingId) { + super(fileContent, text, expectedMarking, documentId, markingId); + } - @Override - public InitializableDataset createDataset(File file) { - return new MasakhaNERDataset(file.getAbsolutePath()); - } + @Override + public InitializableDataset createDataset(File file) { + return new MasakhaNERDataset(file.getAbsolutePath()); + } - @Parameterized.Parameters - public static Collection data() { - List testConfigs = new ArrayList<>(); - // Amharic language - testConfigs.add(new Object[] { - "የጀርመን B-LOC የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", - "የጀርመን የምርጫ ዘመቻን አስመልክቶ ከባልደረባችን ማንተጋፍቶት ስለሺ ጋር ቃለ ምልልስ አድርገናል ፡፡ ", - new TypedSpanImpl(32, 11, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Hausa language - testConfigs.add(new Object[] { - "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", - "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", - new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Igbo language - testConfigs.add(new Object[] { - "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", - "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", - new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Kinyarwanda language - testConfigs.add(new Object[] { - "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . 
O", - "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", - new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Luganda language - testConfigs.add(new Object[] { - "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", - "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", - new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Luo language - testConfigs.add(new Object[] { - "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", - "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", - new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0 }); - // Nigerian Pidgin language - testConfigs.add(new Object[] { - "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", - "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", - new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Swahili language - testConfigs.add(new Object[] { - "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . 
O\n", - "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", - new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Wolof language - testConfigs.add(new Object[] { - "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", - "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", - new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Yoruba language - testConfigs.add(new Object[] { - "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", - "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", - new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Bambara language - testConfigs.add(new Object[] { - "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . O", - "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", - new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Ghomala language - testConfigs.add(new Object[] { - "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", - "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", - new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Ewe language - testConfigs.add(new Object[] { - "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . 
O", - "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", - new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Fon language - testConfigs.add(new Object[] { - "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", - "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", - new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Mossi language - testConfigs.add(new Object[] { - "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", - "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", - new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Chichewa language - testConfigs.add(new Object[] { - "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", - "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", - new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Setswana language - testConfigs.add(new Object[] { - "Zuma B-PER o O ipolela O a O se O molato O. O", - "Zuma o ipolela a se molato.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Twi (Akan/Twi) language - testConfigs.add(new Object[] { - "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", - "Paul resusu sika dodow a ohia na ɔde awie fie no.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // chiShona language - testConfigs.add(new Object[] { - "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . 
O", - "Messi ndiye akarova penalty yekutanga akatadza.", - new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // isiXhosa language - testConfigs.add(new Object[] { - "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. O", - "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", - new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // isiZulu language - testConfigs.add(new Object[] { - "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", - "IMeya yeTheku ingenelela enkingeni yombhikisho", - new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0 }); - - return testConfigs; - } + @Parameterized.Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + // Amharic language + testConfigs.add(new Object[] { + "የጀርመን B-LOC የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", + "የጀርመን : የምርጫ : ዘመቻን : አስመልክቶ : ከባልደረባችን : ማንተጋፍቶት : ስለሺ : ጋር : ቃለ : ምልልስ : አድርገናል ፡፡ ", + new TypedSpanImpl(36, 12, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Hausa language + testConfigs.add(new Object[] { + "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", + "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", + new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Igbo language + testConfigs.add(new Object[] { + "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", + "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", + new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Kinyarwanda language + testConfigs.add(new Object[] { + "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo 
I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", + "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", + new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0 }); + // Luganda language + testConfigs.add(new Object[] { + "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", + "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", + new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Luo language + testConfigs.add(new Object[] { + "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", + "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", + new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0 }); + // Nigerian Pidgin language + testConfigs.add(new Object[] { + "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", + "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", + new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0 }); + // Swahili language + testConfigs.add(new Object[] { + "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . 
O\n", + "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", + new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Wolof language + testConfigs.add(new Object[] { + "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", + "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", + new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Yoruba language + testConfigs.add(new Object[] { + "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", + "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", + new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Bambara language + testConfigs.add(new Object[] { + "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . O", + "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", + new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Ghomala language + testConfigs + .add(new Object[] { "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", + "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", + new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Ewe language + testConfigs.add(new Object[] { + "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . 
O", + "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", + new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Fon language + testConfigs.add(new Object[] { + "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", + "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", + new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Mossi language + testConfigs.add(new Object[] { + "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", + "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", + new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0 }); + // Chichewa language + testConfigs.add(new Object[] { + "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", + "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", + new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0 }); + // Setswana language + testConfigs.add(new Object[] { "Zuma B-PER o O ipolela O a O se O molato O. O", "Zuma o ipolela a se molato.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // Twi (Akan/Twi) language + testConfigs.add(new Object[] { "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", + "Paul resusu sika dodow a ohia na ɔde awie fie no.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // chiShona language + testConfigs.add(new Object[] { "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . 
O", + "Messi ndiye akarova penalty yekutanga akatadza.", + new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0 }); + // isiXhosa language + testConfigs.add(new Object[] { + "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. O", + "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", + new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0 }); + // isiZulu language + testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", + "IMeya yeTheku ingenelela enkingeni yombhikisho", + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0 }); + + return testConfigs; + } } From bfb48dfed76d9adaa2215db49d9c50aaeb0f8e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Tue, 25 Jul 2023 12:17:05 +0200 Subject: [PATCH 16/25] Fixed the generic CoNLL dataset reader together with Neha. --- .../impl/conll/GenericCoNLLDataset.java | 539 ++++++++++-------- .../impl/indqner/IndQNERDatasetTest.java | 11 +- .../dataset/impl/umbc/UMBCDatasetTest.java | 10 +- 3 files changed, 299 insertions(+), 261 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 7de5a7a7f..48537b6a7 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -33,6 +33,7 @@ import org.aksw.gerbil.exceptions.GerbilException; import org.aksw.gerbil.transfer.nif.Document; import org.aksw.gerbil.transfer.nif.Marking; +import org.aksw.gerbil.transfer.nif.Span; import org.aksw.gerbil.transfer.nif.data.DocumentImpl; import org.aksw.gerbil.transfer.nif.data.NamedEntity; import org.aksw.gerbil.transfer.nif.data.SpanImpl; @@ -47,272 +48,304 @@ */ public class GenericCoNLLDataset extends AbstractDataset implements 
InitializableDataset { - private static final Logger LOGGER = LoggerFactory.getLogger(GenericCoNLLDataset.class); + private static final Logger LOGGER = LoggerFactory.getLogger(GenericCoNLLDataset.class); - /** - * Prefix of a value in the marking column that expresses the start of a - * marking. TODO think about removing the '-' or make it configurable. - */ - protected static final String MARKING_START = "B-"; + /** + * Prefix of a value in the marking column that expresses the start of a + * marking. TODO think about removing the '-' or make it configurable. + */ + protected static final String MARKING_START = "B-"; - /** - * Prefix of a value in the marking column that expresses the continuation of a - * marking. - */ - protected static final String MARKING_INSIDE = "I-"; + /** + * Prefix of a value in the marking column that expresses the continuation of a + * marking. + */ + protected static final String MARKING_INSIDE = "I-"; - /** - * The file from which the data will be loaded. - */ - protected String file; - /** - * The list of documents loaded from the file. - */ - protected List documents; - /** - * Id of the first document. - */ - protected int firstDocId; - /** - * Id of the last document. - */ - protected int lastDocId; - /** - * Class to map markings from the dataset to their type IRI. - */ - protected CoNLLTypeRetriever typeRetriever; - /** - * Id of the column that contains the annotations. - */ - protected int annotationColumn; - /** - * Id of the column that contains the entity's IRI. If there is no such column, - * it is set to -1. - */ - protected int uriColumn; + /** + * The file from which the data will be loaded. + */ + protected String file; + /** + * The list of documents loaded from the file. + */ + protected List documents; + /** + * Id of the first document. + */ + protected int firstDocId; + /** + * Id of the last document. + */ + protected int lastDocId; + /** + * Class to map markings from the dataset to their type IRI. 
+ */ + protected CoNLLTypeRetriever typeRetriever; + /** + * Id of the column that contains the annotations. + */ + protected int annotationColumn; + /** + * Id of the column that contains the entity's IRI. If there is no such column, + * it is set to -1. + */ + protected int uriColumn; - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { - this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { + this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); + } - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, - String firstDocId, String lastDocId) { - this(file, annotationColumn, uriColumn, typeRetriever, Integer.parseInt(firstDocId), - Integer.parseInt(lastDocId)); - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, + String firstDocId, String lastDocId) { + this(file, annotationColumn, uriColumn, typeRetriever, Integer.parseInt(firstDocId), + Integer.parseInt(lastDocId)); + } - public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, - int firstDocId, int lastDocId) { - this.file = file; - this.annotationColumn = annotationColumn; - this.uriColumn = uriColumn; - this.typeRetriever = typeRetriever; - this.firstDocId = firstDocId; - this.lastDocId = lastDocId; - } + public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever, + int firstDocId, int lastDocId) { + this.file = file; + this.annotationColumn = annotationColumn; + this.uriColumn = uriColumn; + this.typeRetriever = typeRetriever; + this.firstDocId = firstDocId; + this.lastDocId = lastDocId; + } - @Override - public int size() { - return documents.size(); - } + @Override + public int 
size() { + return documents.size(); + } - @Override - public List getInstances() { - return documents; - } + @Override + public List getInstances() { + return documents; + } - @Override - public void init() throws GerbilException { - this.documents = loadDocuments(new File(file)); - if ((firstDocId > 0) && (lastDocId > 0)) { - this.documents = this.documents.subList(firstDocId - 1, lastDocId); - } - } + @Override + public void init() throws GerbilException { + this.documents = loadDocuments(new File(file)); + if ((firstDocId > 0) && (lastDocId > 0)) { + this.documents = this.documents.subList(firstDocId - 1, lastDocId); + } + } - /** - * This method loads the CoNLL dataset from the given file. - * - * @param file file from which the dataset will be loaded - * @return list of {@link Document} instances that have been loaded. - * @throws GerbilException if there is an IO error while reading the file. - */ - protected List loadDocuments(File file) throws GerbilException { - List documents = new ArrayList(); - // Create namespace for the documents of this dataset - String documentUriPrefix = "http://" + getName() + "/"; - try (BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));) { - String line = reader.readLine(); - // Id of the next document in this file - int index = 0; - List linesOfCurrentDoc = new ArrayList<>(); - // Iterate through all lines until the complete file has been read. 
- while (line != null) { - // If there is an empty line, the previous document ended and will be added to - // the list of documents - if (line.trim().isEmpty()) { - // If there is a document that can be added - if (linesOfCurrentDoc.size() > 0) { - // Get Markings - StringBuilder currentText = new StringBuilder(); - List markings = processSingleDocument(linesOfCurrentDoc, new StringBuilder()); - // Save the document - documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); - // Increase the document ID - index++; - } - // Clear the lines for the next document - linesOfCurrentDoc.clear(); - } else { - // Add the current line to the list of lines of the current document - linesOfCurrentDoc.add(line); - } - // Read the next line - line = reader.readLine(); - } - // check if there is a document left that should be added - if (linesOfCurrentDoc.size() > 0) { - // Get Markings - StringBuilder currentText = new StringBuilder(); - List markings = processSingleDocument(linesOfCurrentDoc, currentText); - // Save last document - documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); - } - } catch (IOException e) { - throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR); - } - return documents; - } + /** + * This method loads the CoNLL dataset from the given file. + * + * @param file file from which the dataset will be loaded + * @return list of {@link Document} instances that have been loaded. + * @throws GerbilException if there is an IO error while reading the file. 
+ */ + protected List loadDocuments(File file) throws GerbilException { + List documents = new ArrayList(); + // Create namespace for the documents of this dataset + String documentUriPrefix = "http://" + getName() + "/"; + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));) { + String line = reader.readLine(); + // Id of the next document in this file + int index = 0; + List linesOfCurrentDoc = new ArrayList<>(); + // Iterate through all lines until the complete file has been read. + while (line != null) { + // If there is an empty line, the previous document ended and will be added to + // the list of documents + if (line.trim().isEmpty()) { + // If there is a document that can be added + if (linesOfCurrentDoc.size() > 0) { + // Get Markings + StringBuilder currentText = new StringBuilder(); + List markings = processSingleDocument(linesOfCurrentDoc, currentText); + // Save the document + documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); + // Increase the document ID + index++; + } + // Clear the lines for the next document + linesOfCurrentDoc.clear(); + } else { + // Add the current line to the list of lines of the current document + linesOfCurrentDoc.add(line); + } + // Read the next line + line = reader.readLine(); + } + // check if there is a document left that should be added + if (linesOfCurrentDoc.size() > 0) { + // Get Markings + StringBuilder currentText = new StringBuilder(); + List markings = processSingleDocument(linesOfCurrentDoc, currentText); + // Save last document + documents.add(new DocumentImpl(currentText.toString(), documentUriPrefix + index, markings)); + } + } catch (IOException e) { + throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR); + } + return documents; + } - /** - * Find markings of document and add document text to the given StringBuilder. 
- * - * @param linesOfCurrentDoc the lines of the current document - * @param currentText StringBuilder to which the document text should be - * added - * @return The list of {@link Marking} instances found within the document - */ - protected List processSingleDocument(List linesOfCurrentDoc, StringBuilder currentText) { - List markings = new ArrayList(); - int i = 0; - // Flags to track if a whitespace should be inserted in front of and behind a - // line - boolean whiteSpaceInFront = true; - boolean whiteSpaceBehind = true; - // Iterate over the document lines - for (String tokenFull : linesOfCurrentDoc) { - // split the columns - String[] token = tokenFull.split("\t+"); - // If we can parse this line - if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { - // Get the marking from this line (and maybe the next lines) - markings.add(getWholeMarking(linesOfCurrentDoc, i, currentText)); - } - // Check if the line has only one character - if ((currentText.length() > 0) && (token[0].length() >= 1)) { - if (token[0].length() == 1) { - char ch = token[0].charAt(0); // Get the character - switch (ch) { - case '?': // falls through - case '!': - case ',': - case ')': - case ']': - case '}': - case '.': { - whiteSpaceInFront = false; - break; - } - case '"': { - // Toggle whiteSpaceBehind if the character is a quote mark - whiteSpaceBehind = !whiteSpaceBehind; - break; - } - case '(': // falls through - case '[': - case '{': { - whiteSpaceBehind = false; - break; - } - case '፠': // ፠ section mark - case '፡': // ፡ word separator - case '።': // ። full stop (period) - case '፣': // ፣ comma - case '፤': // ፤ semicolon - case '፥': // ፥ colon - case '፦': // ፦ preface colon - case '፧': // ፧ question mark - case '፨': // ፨ paragraph separator - whiteSpaceInFront = true; - whiteSpaceBehind = true; - break; - default: { - break; - } - } + /** + * Find markings of document and add document text to the given StringBuilder. 
+ * + * @param linesOfCurrentDoc the lines of the current document + * @param currentText StringBuilder to which the document text should be + * added + * @return The list of {@link Marking} instances found within the document + */ + protected List processSingleDocument(List linesOfCurrentDoc, StringBuilder currentText) { + List markings = new ArrayList(); + Span currentMarking = null; + // Flags to track if a whitespace should be inserted in front of and behind a + // line + boolean whiteSpaceInFront = false; + boolean whiteSpaceBehindPreviousToken = false; + boolean whiteSpaceBehindCurrentToken = false; + boolean sawQuoteBefore = false; + // Iterate over the document lines + for (String tokenFull : linesOfCurrentDoc) { + whiteSpaceBehindPreviousToken = whiteSpaceBehindCurrentToken; + // split the columns + String[] token = tokenFull.split("\t+"); + // Check if the line has only one character + if (token[0].length() == 1) { + char ch = token[0].charAt(0); // Get the character + switch (ch) { + case '?': // falls through + case '!': + case ',': + case ')': + case ']': + case '}': + case ':': + case ';': + case '.': { + whiteSpaceInFront = false; + whiteSpaceBehindCurrentToken = true; + break; + } + // General quotation characters (can be start or end) + // According to https://www.overleaf.com/learn/latex/Typesetting_quotations + case '"': // falls through + case '»': // Start in Danish, end in French, Russian, etc. 
+ case '«': // Start in French, Russian, etc.; end in Danish + case '“': // Start in English, end in German, Lithuanian, Polish + { + // Toggle whiteSpaceBehind if the character is a quote mark + whiteSpaceInFront = !sawQuoteBefore; + whiteSpaceBehindCurrentToken = sawQuoteBefore; + sawQuoteBefore = !sawQuoteBefore; + break; + } + // English, UK ‘…’ + // Start quotation characters + case '„': // Start in German, Lithuanian, Polish + case '‚': // Start in English + { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = false; + sawQuoteBefore = true; + } + // End quotation characters + case '”': // End in a lot of languages + case '‘': // End in English + { + whiteSpaceInFront = false; + whiteSpaceBehindCurrentToken = true; + sawQuoteBefore = false; + } + case '(': // falls through + case '[': + case '{': { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = false; + break; + } + case '፠': // ፠ section mark // falls through + case '፡': // ፡ word separator + case '።': // ። full stop (period) + case '፣': // ፣ comma + case '፤': // ፤ semicolon + case '፥': // ፥ colon + case '፦': // ፦ preface colon + case '፧': // ፧ question mark + case '፨': // ፨ paragraph separator + default: { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = true; + break; + } + } +// } else if (!Character.isLetterOrDigit(token[0].charAt(0))) { +// // Check if the first character of the line is not a letter or digit +// whiteSpaceInFront = false; +// // Set whiteSpaceInFront to false if the line starts with a non-letter or +// // non-digit character + } else { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = true; + } + // Remove leading/trailing whitespaces and normalize spaces within the token + String normalizedToken = token[0].trim().replaceAll("\\s+", " "); - } else if (!Character.isLetterOrDigit(token[0].charAt(0))) { - // Check if the first character of the line is not a letter or digit - whiteSpaceInFront = false; - // Set whiteSpaceInFront to false if the 
line starts with a non-letter or - // non-digit character - } - } - // Remove leading/trailing whitespaces and normalize spaces within the token - String normalizedToken = token[0].trim().replaceAll("\\s+", " "); - // Add the token from this line to the document's text - // TODO 1. make the whitespace configurable to allow other word separators 2. - // Remove the previous word separator if we have a punctuation character. - // (quotation, apostrophe) - currentText.append(normalizedToken + " "); - // Increase the line ID - i++; - } - return markings; - } + // If the current marking is not null AND there is no annotation column or there + // is no MARKING_INSIDE annotation --> The previous marking ended + if (currentMarking != null + && (token.length <= annotationColumn || !token[annotationColumn].startsWith(MARKING_INSIDE))) { + currentMarking.setLength(currentText.length() - currentMarking.getStartPosition()); + currentMarking = null; + } - protected Marking getWholeMarking(List linesOfCurrentDoc, int pos, StringBuilder currentText) { - String[] tokens = linesOfCurrentDoc.get(pos).split("\t"); + // Add the token from this line to the document's text + if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) { + currentText.append(" "); + } + // If this line contains the start of a marking, we should keep track of it + if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { + // Create new marking + currentMarking = createNewMarking(token, currentText.length()); + markings.add(currentMarking); + } + // TODO 1. make the whitespace configurable to allow other word separators 2. + // Remove the previous word separator if we have a punctuation character. 
+ // (quotation, apostrophe) + currentText.append(normalizedToken); + } + // If there is an unfinished marking, finalize it + if (currentMarking != null) { + currentMarking.setLength(currentText.length() - currentMarking.getStartPosition()); + } + return markings; + } - // get type of the marking TODO if the B- and I- are configurable, the - // substring(2) has to be configurable as well. - String type = typeRetriever.getTypeURI(tokens[annotationColumn].substring(2)); + private Span createNewMarking(String[] line, int startPos) { + // get type of the marking TODO if the B- and I- are configurable, the + // substring(2) has to be configurable as well. + String type = typeRetriever.getTypeURI(line[annotationColumn].substring(2)); - // get uri of the marking if given in the dataset - String uri = null; - if (uriColumn != -1 && tokens[uriColumn].startsWith("http")) { - uri = tokens[uriColumn]; - } - - // get surface form of the marking - StringBuilder surfaceForm = new StringBuilder().append(tokens[0]); - for (int i = pos + 1; i < linesOfCurrentDoc.size(); i++) { - tokens = linesOfCurrentDoc.get(i).split("\t"); - if (tokens[annotationColumn].startsWith(MARKING_INSIDE)) { - // TODO 1. make the whitespace configurable to allow other word separators 2. - // Remove the previous word separator if we have a punctuation character. - surfaceForm.append(" ").append(tokens[0]); - } else { - break; - } - } - if (type != null) { - if (uri != null) { - return new TypedNamedEntity(currentText.length(), surfaceForm.length(), uri, - new HashSet(Arrays.asList(type))); - } else { - return new TypedSpanImpl(currentText.length(), surfaceForm.length(), - new HashSet(Arrays.asList(type))); - } - } else { - if (uri != null) { - return new NamedEntity(currentText.length(), surfaceForm.length(), uri); - } else { - LOGGER.warn( - "Found a marked piece of text without any further information: \"{}\". 
This is either an error in the dataset or this adapter is not correctly configured.", - surfaceForm); - return new SpanImpl(currentText.length(), surfaceForm.length()); - } - } - } + // get uri of the marking if given in the dataset + String uri = null; + if (uriColumn != -1 && line[uriColumn].startsWith("http")) { + uri = line[uriColumn]; + } + // We set the length of the newly created marking to 0, because we have to + // override it, anyway. If we ever see a 0 outside of this class, we know that + // something went wrong. + if (type != null) { + if (uri != null) { + return new TypedNamedEntity(startPos, 0, uri, new HashSet(Arrays.asList(type))); + } else { + return new TypedSpanImpl(startPos, 0, new HashSet(Arrays.asList(type))); + } + } else { + if (uri != null) { + return new NamedEntity(startPos, 0, uri); + } else { + LOGGER.warn( + "Found a marked piece of text without any further information: \"{}\". This is either an error in the dataset or this adapter is not correctly configured.", + Arrays.toString(line)); + return new SpanImpl(startPos, 0); + } + } + } } \ No newline at end of file diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDatasetTest.java index fe39a226a..27af27fe5 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/indqner/IndQNERDatasetTest.java @@ -30,11 +30,16 @@ public static Collection data() { // test configurations testConfigs.add(new Object[] { "Dia O\nmenarik O\ntangannya O\n, O\ntiba-tiba O\nia O\n( O\ntangan O\nitu O\n) O\nmenjadi O\nputih B-Color\n( O\nbercahaya O\n) O\nbagi O\norang-orang O\nyang O\nmelihat O\n( O\n-nya O\n) O\n. O\n\nPara O\npemuka O\nkaum O\nFir‘aun B-Person\nberkata O\n, O\n“ O\nSesungguhnya O\norang O\nini O\nbenar-benar O\npenyihir O\nyang O\nsangat O\npandai O\n. O\n\nDia O\nhendak O\nmengusir O\nkamu O\ndari O\nnegerimu O\n. 
O", - "Dia menarik tangannya , tiba-tiba ia ( tangan itu ) menjadi putih ( bercahaya ) bagi orang-orang yang melihat ( -nya ) . ", - new TypedSpanImpl(60, 5, "https://corpus.quran.com/concept.jsp?id=color"), 0, 0 }); + "Dia menarik tangannya, tiba-tiba ia (tangan itu) menjadi putih (bercahaya) bagi orang-orang yang melihat (-nya).", + new TypedSpanImpl(57, 5, "https://corpus.quran.com/concept.jsp?id=color"), 0, 0 }); + /* + * FIXME The following test example has a wrongly positioned “ character which + * is the marking of an end of a quotation, but is handled like a start in this + * example since the algorithm does not know that there was more text before. + */ testConfigs.add(new Object[] { "Dia O\nmenarik O\ntangannya O\n, O\ntiba-tiba O\nia O\n( O\ntangan O\nitu O\n) O\nmenjadi O\nputih B-Color\n( O\nbercahaya O\n) O\nbagi O\norang-orang O\nyang O\nmelihat O\n( O\n-nya O\n) O\n. O\n\nPara O\npemuka O\nkaum O\nFir‘aun B-Person\nberkata O\n, O\n“ O\nSesungguhnya O\norang O\nini O\nbenar-benar O\npenyihir O\nyang O\nsangat O\npandai O\n. O\n\nDia O\nhendak O\nmengusir O\nkamu O\ndari O\nnegerimu O\n. O", - "Para pemuka kaum Fir‘aun berkata , “ Sesungguhnya orang ini benar-benar penyihir yang sangat pandai . 
", + "Para pemuka kaum Fir‘aun berkata, “Sesungguhnya orang ini benar-benar penyihir yang sangat pandai.", new TypedSpanImpl(17, 7, "http://dbpedia.org/ontology/Person"), 1, 0 }); return testConfigs; diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDatasetTest.java index 76df4a9cc..ae0c71223 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/umbc/UMBCDatasetTest.java @@ -47,22 +47,22 @@ public static Collection data() { // Simple example with a single document and a single entity testConfigs.add(new Object[] { "Texans O\nurged O\nto O\nflee O\nas O\nIke O\nmenaces O\ncoast O\n: O\nAuthorities O\nhave O\nurged O\nresidents O\nto O\nflee O\nthe O\nTexas B-LOC\ncoast I-LOC\n, O\na O\nURL O", - "Texans urged to flee as Ike menaces coast : Authorities have urged residents to flee the Texas coast , a URL ", - new TypedSpanImpl(89, 11, "http://dbpedia.org/ontology/Place"), 0, 0 }); + "Texans urged to flee as Ike menaces coast: Authorities have urged residents to flee the Texas coast, a URL", + new TypedSpanImpl(88, 11, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Example with 2 documents. We check the first entity in the first document. testConfigs.add(new Object[] { "I'm O\nbored O\nhere O\nin O\nSydney B-LOC\n. O\nI O\nwant O\nto O\ndo O\nsomething O\nanyone O\nwant O\nto O\nwatch O\nmovies O\n? O\n\nI O\ncan O\nfeel O\nthe O\nMobile B-ORG\nWorld I-ORG\nCongress I-ORG\nvibe O\non O\nTwitter B-ORG\nSee O\nyou O\nguys O\nin O\nBarcelona B-LOC\nnext O\nweek O\n. O", - "I'm bored here in Sydney . I want to do something anyone want to watch movies ? ", + "I'm bored here in Sydney. I want to do something anyone want to watch movies?", new TypedSpanImpl(18, 6, "http://dbpedia.org/ontology/Place"), 0, 0 }); // Example with 2 documents. We check the first entity in the second document. 
testConfigs.add(new Object[] { "I'm O\nbored O\nhere O\nin O\nSydney B-LOC\n. O\nI O\nwant O\nto O\ndo O\nsomething O\nanyone O\nwant O\nto O\nwatch O\nmovies O\n? O\n\nI O\ncan O\nfeel O\nthe O\nMobile B-ORG\nWorld I-ORG\nCongress I-ORG\nvibe O\non O\nTwitter B-ORG\nSee O\nyou O\nguys O\nin O\nBarcelona B-LOC\nnext O\nweek O\n. O", - "I can feel the Mobile World Congress vibe on Twitter See you guys in Barcelona next week . ", + "I can feel the Mobile World Congress vibe on Twitter See you guys in Barcelona next week.", new TypedSpanImpl(15, 21, "http://dbpedia.org/ontology/Organisation"), 1, 0 }); // Example with 2 documents. We check the third entity in the second document. testConfigs.add(new Object[] { "I'm O\nbored O\nhere O\nin O\nSydney B-LOC\n. O\nI O\nwant O\nto O\ndo O\nsomething O\nanyone O\nwant O\nto O\nwatch O\nmovies O\n? O\n\nI O\ncan O\nfeel O\nthe O\nMobile B-ORG\nWorld I-ORG\nCongress I-ORG\nvibe O\non O\nTwitter B-ORG\nSee O\nyou O\nguys O\nin O\nBarcelona B-LOC\nnext O\nweek O\n. O", - "I can feel the Mobile World Congress vibe on Twitter See you guys in Barcelona next week . ", + "I can feel the Mobile World Congress vibe on Twitter See you guys in Barcelona next week.", new TypedSpanImpl(69, 9, "http://dbpedia.org/ontology/Place"), 1, 2 }); return testConfigs; } From ca3d0bfae152c64d0ca2113e0cd284c8252423d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Tue, 25 Jul 2023 12:43:15 +0200 Subject: [PATCH 17/25] Started adapting the generic CoNLL dataset reader to fit to the masakhaNER datasets. 
--- .../impl/conll/GenericCoNLLDataset.java | 163 ++++++++++++- .../impl/masakha/MasakhaNERDataset.java | 6 +- .../impl/masakha/MasakhaNERDatasetTest.java | 229 +++++++++--------- 3 files changed, 273 insertions(+), 125 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 48537b6a7..356beeb63 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -91,6 +91,18 @@ public class GenericCoNLLDataset extends AbstractDataset implements Initializabl * it is set to -1. */ protected int uriColumn; + /** + * The character used to separate columns. + */ + protected String columnSeparator = "\t"; + /** + * String used to insert white spaces between tokens. + */ + protected String whitespace = " "; + /** + * String inserted between tokens if no white space should be inserted. 
+ */ + protected String nonWhitespace = ""; public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); @@ -206,7 +218,7 @@ protected List processSingleDocument(List linesOfCurrentDoc, St for (String tokenFull : linesOfCurrentDoc) { whiteSpaceBehindPreviousToken = whiteSpaceBehindCurrentToken; // split the columns - String[] token = tokenFull.split("\t+"); + String[] token = tokenFull.split(columnSeparator); // Check if the line has only one character if (token[0].length() == 1) { char ch = token[0].charAt(0); // Get the character @@ -219,7 +231,16 @@ protected List processSingleDocument(List linesOfCurrentDoc, St case '}': case ':': case ';': - case '.': { + case '.': + case '፠': // ፠ section mark // falls through + case '።': // ። full stop (period) + case '፣': // ፣ comma + case '፤': // ፤ semicolon + case '፥': // ፥ colon + case '፦': // ፦ preface colon + case '፧': // ፧ question mark + case '፨': // ፨ paragraph separator + { whiteSpaceInFront = false; whiteSpaceBehindCurrentToken = true; break; @@ -261,15 +282,7 @@ protected List processSingleDocument(List linesOfCurrentDoc, St whiteSpaceBehindCurrentToken = false; break; } - case '፠': // ፠ section mark // falls through case '፡': // ፡ word separator - case '።': // ። full stop (period) - case '፣': // ፣ comma - case '፤': // ፤ semicolon - case '፥': // ፥ colon - case '፦': // ፦ preface colon - case '፧': // ፧ question mark - case '፨': // ፨ paragraph separator default: { whiteSpaceInFront = true; whiteSpaceBehindCurrentToken = true; @@ -298,7 +311,9 @@ protected List processSingleDocument(List linesOfCurrentDoc, St // Add the token from this line to the document's text if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) { - currentText.append(" "); + currentText.append(whitespace); + } else { + currentText.append(nonWhitespace); } // If this line contains the start of a marking, we should keep track of it 
if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { @@ -348,4 +363,130 @@ private Span createNewMarking(String[] line, int startPos) { } } } + + /** + * @return the file + */ + public String getFile() { + return file; + } + + /** + * @param file the file to set + */ + public void setFile(String file) { + this.file = file; + } + + /** + * @return the firstDocId + */ + public int getFirstDocId() { + return firstDocId; + } + + /** + * @param firstDocId the firstDocId to set + */ + public void setFirstDocId(int firstDocId) { + this.firstDocId = firstDocId; + } + + /** + * @return the lastDocId + */ + public int getLastDocId() { + return lastDocId; + } + + /** + * @param lastDocId the lastDocId to set + */ + public void setLastDocId(int lastDocId) { + this.lastDocId = lastDocId; + } + + /** + * @return the typeRetriever + */ + public CoNLLTypeRetriever getTypeRetriever() { + return typeRetriever; + } + + /** + * @param typeRetriever the typeRetriever to set + */ + public void setTypeRetriever(CoNLLTypeRetriever typeRetriever) { + this.typeRetriever = typeRetriever; + } + + /** + * @return the annotationColumn + */ + public int getAnnotationColumn() { + return annotationColumn; + } + + /** + * @param annotationColumn the annotationColumn to set + */ + public void setAnnotationColumn(int annotationColumn) { + this.annotationColumn = annotationColumn; + } + + /** + * @return the uriColumn + */ + public int getUriColumn() { + return uriColumn; + } + + /** + * @param uriColumn the uriColumn to set + */ + public void setUriColumn(int uriColumn) { + this.uriColumn = uriColumn; + } + + /** + * @return the whitespace + */ + public String getWhitespace() { + return whitespace; + } + + /** + * @param whitespace the whitespace to set + */ + public void setWhitespace(String whitespace) { + this.whitespace = whitespace; + } + + /** + * @return the nonWhitespace + */ + public String getNonWhitespace() { + return nonWhitespace; + } + + /** + * 
@param nonWhitespace the nonWhitespace to set + */ + public void setNonWhitespace(String nonWhitespace) { + this.nonWhitespace = nonWhitespace; + } + + /** + * @return the columnSeparator + */ + public String getColumnSeparator() { + return columnSeparator; + } + + /** + * @param columnSeparator the columnSeparator to set + */ + public void setColumnSeparator(String columnSeparator) { + this.columnSeparator = columnSeparator; + } } \ No newline at end of file diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index 86ac142d4..aa4c919d1 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -10,8 +10,12 @@ public class MasakhaNERDataset extends GenericCoNLLDataset { private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, "DATE", "PER", null, null, null, "ORG"); - public MasakhaNERDataset(String file) { + public MasakhaNERDataset(String file, boolean isAmharic) { super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); + setColumnSeparator(" "); + if(isAmharic) { + setWhitespace(" ፡ "); + } } } diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 63408c4e0..8624a81fb 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -15,121 +15,124 @@ @RunWith(Parameterized.class) public class MasakhaNERDatasetTest extends AbstractGenericCoNLLDatasetTest { - public MasakhaNERDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId, - int markingId) { - super(fileContent, text, expectedMarking, documentId, markingId); - } + private boolean 
isAmharic = false; - @Override - public InitializableDataset createDataset(File file) { - return new MasakhaNERDataset(file.getAbsolutePath()); - } + public MasakhaNERDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId, + int markingId, boolean isAmharic) { + super(fileContent, text, expectedMarking, documentId, markingId); + this.isAmharic = isAmharic; + } - @Parameterized.Parameters - public static Collection data() { - List testConfigs = new ArrayList(); - // Amharic language - testConfigs.add(new Object[] { - "የጀርመን B-LOC የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", - "የጀርመን : የምርጫ : ዘመቻን : አስመልክቶ : ከባልደረባችን : ማንተጋፍቶት : ስለሺ : ጋር : ቃለ : ምልልስ : አድርገናል ፡፡ ", - new TypedSpanImpl(36, 12, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Hausa language - testConfigs.add(new Object[] { - "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", - "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", - new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Igbo language - testConfigs.add(new Object[] { - "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", - "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", - new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Kinyarwanda language - testConfigs.add(new Object[] { - "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . 
O", - "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", - new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Luganda language - testConfigs.add(new Object[] { - "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", - "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", - new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Luo language - testConfigs.add(new Object[] { - "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", - "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", - new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0 }); - // Nigerian Pidgin language - testConfigs.add(new Object[] { - "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", - "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", - new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Swahili language - testConfigs.add(new Object[] { - "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . 
O\n", - "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", - new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Wolof language - testConfigs.add(new Object[] { - "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", - "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", - new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Yoruba language - testConfigs.add(new Object[] { - "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", - "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", - new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Bambara language - testConfigs.add(new Object[] { - "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . O", - "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", - new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Ghomala language - testConfigs - .add(new Object[] { "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", - "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", - new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Ewe language - testConfigs.add(new Object[] { - "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . 
O", - "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", - new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Fon language - testConfigs.add(new Object[] { - "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", - "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", - new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Mossi language - testConfigs.add(new Object[] { - "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", - "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", - new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0 }); - // Chichewa language - testConfigs.add(new Object[] { - "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", - "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", - new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // Setswana language - testConfigs.add(new Object[] { "Zuma B-PER o O ipolela O a O se O molato O. O", "Zuma o ipolela a se molato.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // Twi (Akan/Twi) language - testConfigs.add(new Object[] { "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", - "Paul resusu sika dodow a ohia na ɔde awie fie no.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // chiShona language - testConfigs.add(new Object[] { "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . 
O", - "Messi ndiye akarova penalty yekutanga akatadza.", - new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0 }); - // isiXhosa language - testConfigs.add(new Object[] { - "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. O", - "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", - new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0 }); - // isiZulu language - testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", - "IMeya yeTheku ingenelela enkingeni yombhikisho", - new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0 }); + @Override + public InitializableDataset createDataset(File file) { + return new MasakhaNERDataset(file.getAbsolutePath(), isAmharic); + } - return testConfigs; - } + @Parameterized.Parameters + public static Collection data() { + List testConfigs = new ArrayList(); + // Amharic language + testConfigs.add(new Object[] { + "የጀርመን B-LOC\n የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", + "የጀርመን : የምርጫ : ዘመቻን : አስመልክቶ : ከባልደረባችን : ማንተጋፍቶት : ስለሺ : ጋር : ቃለ : ምልልስ : አድርገናል ፡፡ ", + new TypedSpanImpl(36, 12, "http://dbpedia.org/ontology/Person"), 0, 0, true }); + // Hausa language + testConfigs.add(new Object[] { + "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", + "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", + new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Igbo language + testConfigs.add(new Object[] { + "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", + "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", + new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0, 
false }); + // Kinyarwanda language + testConfigs.add(new Object[] { + "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", + "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", + new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + // Luganda language + testConfigs.add(new Object[] { + "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", + "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", + new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + // Luo language + testConfigs.add(new Object[] { + "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", + "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", + new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0, false }); + // Nigerian Pidgin language + testConfigs.add(new Object[] { + "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", + "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", + new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + // Swahili language + testConfigs.add(new Object[] { + "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n 
juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n leo O\n jumanne B-DATE\n . O\n", + "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", + new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Wolof language + testConfigs.add(new Object[] { + "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", + "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", + new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Yoruba language + testConfigs.add(new Object[] { + "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . O ", + "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", + new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + // Bambara language + testConfigs.add(new Object[] { + "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . 
O", + "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", + new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Ghomala language + testConfigs + .add(new Object[] { "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", + "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", + new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Ewe language + testConfigs.add(new Object[] { + "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", + "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", + new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + // Fon language + testConfigs.add(new Object[] { + "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", + "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", + new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + // Mossi language + testConfigs.add(new Object[] { + "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", + "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", + new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + // Chichewa language + testConfigs.add(new Object[] { + "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", + "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", + new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + // Setswana language + testConfigs.add(new Object[] { "Zuma B-PER o O ipolela O a O se O molato O. 
O", "Zuma o ipolela a se molato.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // Twi (Akan/Twi) language + testConfigs.add(new Object[] { "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", + "Paul resusu sika dodow a ohia na ɔde awie fie no.", + new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // chiShona language + testConfigs.add(new Object[] { "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . O", + "Messi ndiye akarova penalty yekutanga akatadza.", + new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + // isiXhosa language + testConfigs.add(new Object[] { + "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. O", + "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", + new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + // isiZulu language + testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", + "IMeya yeTheku ingenelela enkingeni yombhikisho", + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + + return testConfigs; + } } From 5a5f0dd489af1dcb8ed1097c4970cf2d420eb2d3 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 26 Jul 2023 11:32:52 +0200 Subject: [PATCH 18/25] added few punctuations and fixed the failure of MasakhaNERDatasetTest cases --- .../impl/conll/GenericCoNLLDataset.java | 2 + .../impl/masakha/MasakhaNERDatasetTest.java | 91 ++++++++++--------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 356beeb63..61be21e22 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ 
b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -232,6 +232,8 @@ protected List processSingleDocument(List linesOfCurrentDoc, St case ':': case ';': case '.': + case '#': + case '-': case '፠': // ፠ section mark // falls through case '።': // ። full stop (period) case '፣': // ፣ comma diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 8624a81fb..6145edee8 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -33,102 +33,107 @@ public static Collection data() { List testConfigs = new ArrayList(); // Amharic language testConfigs.add(new Object[] { - "የጀርመን B-LOC\n የምርጫ O ዘመቻን O አስመልክቶ O ከባልደረባችን O ማንተጋፍቶት B-PER ስለሺ I-PER ጋር O ቃለ O ምልልስ O አድርገናል O ፡፡ O", + "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n፡፡ O", "የጀርመን : የምርጫ : ዘመቻን : አስመልክቶ : ከባልደረባችን : ማንተጋፍቶት : ስለሺ : ጋር : ቃለ : ምልልስ : አድርገናል ፡፡ ", new TypedSpanImpl(36, 12, "http://dbpedia.org/ontology/Person"), 0, 0, true }); // Hausa language testConfigs.add(new Object[] { - "A O\n saurari O\n cikakken O\n rahoton O\n wakilin O\n Muryar B-ORG\n Amurka I-ORG\n Ibrahim B-PER\n Abdul'aziz I-PER", + "A O\nsaurari O\ncikakken O\nrahoton O\nwakilin O\nMuryar B-ORG\nAmurka I-ORG\nIbrahim B-PER\nAbdul'aziz I-PER", "A saurari cikakken rahoton wakilin Muryar Amurka Ibrahim Abdul'aziz", - new TypedSpanImpl(49, 18, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(35, 13, "http://dbpedia.org/ontology/Organisation"), 0, 0, false }); // Igbo language testConfigs.add(new Object[] { - "Ike O\n ịda O\n jụụ O\n otụ B-DATE\n nkeji I-DATE\n banyere O\n oke O\n ogbugbu O\n na O\n - O\n eme O\n n'ala O\n Naijiria B-LOC\n agwụla O\n Ekweremmadụ B-PER", - "Ike ịda jụụ otụ nkeji 
banyere oke ogbugbu na - eme n'ala Naijiria agwụla Ekweremmadụ", - new TypedSpanImpl(57, 8, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + "Ike O\nịda O\njụụ O\notụ B-DATE\nnkeji I-DATE\nbanyere O\noke O\nogbugbu O\nna O\n- O\neme O\nn'ala O\nNaijiria B-LOC\nagwụla O\nEkweremmadụ B-PER", + "Ike ịda jụụ otụ nkeji banyere oke ogbugbu na- eme n'ala Naijiria agwụla Ekweremmadụ", + new TypedSpanImpl(12, 9, "http://dbpedia.org/ontology/Unknown"), 0, 0, false }); // Kinyarwanda language testConfigs.add(new Object[] { - "Ambasaderi O\n wa O\n EU B-ORG\n mu O\n Rwanda B-LOC\n O\n Nicola B-PER\n Bellomo I-PER\n yagize O\n ati O\n O\n Inkunga O\n yacu O\n ni O\n imwe O\n mu O\n nkunga O\n yagutse O\nyiswe O\n # O\n TeamEurope O\n . O", - "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse nyiswe # TeamEurope.", - new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + "Ambasaderi O\nwa O\nEU B-ORG\nmu O\nRwanda B-LOC\nNicola B-PER\nBellomo I-PER\nyagize O\nati O\nInkunga O\nyacu O\nni O\nimwe O\nmu O\nnkunga O\nyagutse O\nyiswe O\n# O\nTeamEurope O\n. O", + "Ambasaderi wa EU mu Rwanda Nicola Bellomo yagize ati Inkunga yacu ni imwe mu nkunga yagutse yiswe# TeamEurope.", + new TypedSpanImpl(14, 2, "http://dbpedia.org/ontology/Organisation"), 0, 0, false }); // Luganda language testConfigs.add(new Object[] { - "Empaka O\n zaakubeera O\n mu O\n kibuga O\n Liverpool B-LOC\n e O\n Bungereza B-LOC\n O\n okutandika O\n nga O\n July B-DATE\n 12 I-DATE\n . O", + "Empaka O\nzaakubeera O\nmu O\nkibuga O\nLiverpool B-LOC\ne O\nBungereza B-LOC\nokutandika O\nnga O\nJuly B-DATE\n12 I-DATE\n. 
O", "Empaka zaakubeera mu kibuga Liverpool e Bungereza okutandika nga July 12.", new TypedSpanImpl(28, 9, "http://dbpedia.org/ontology/Place"), 0, 0, false }); // Luo language testConfigs.add(new Object[] { - "Migosi O Raila B-PER ne O owuoyo O e O video O mane O ogol O kod O nyare O matin O Winnie B-PER Odinga I-PER", + "Migosi O\nRaila B-PER\nne O\nowuoyo O\ne O\nvideo O\nmane O\nogol O\nkod O\nnyare O\nmatin O\nWinnie B-PER\nOdinga I-PER", "Migosi Raila ne owuoyo e video mane ogol kod nyare matin Winnie Odinga", - new TypedSpanImpl(57, 13, "http://dbpedia.org/ontology/Location"), 0, 0, false }); + new TypedSpanImpl(7, 5, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Nigerian Pidgin language testConfigs.add(new Object[] { - "Mixed B-ORG\n Martial I-ORG\n Arts I-ORG\n joinbodi O\n O\n Ultimate B-ORG\n Fighting I-ORG\n Championship I-ORG\n O\n UFC B-ORG\n don O\n decide O\n say O\n dem O\n go O\n enta O\n back O\n di O\n octagon O\n on O\n Saturday B-DATE\n I-DATE\n 9 I-DATE\n May I-DATE\n O\n for O\n Jacksonville B-LOC\n O", + "Mixed B-ORG\nMartial I-ORG\nArts I-ORG\njoinbodi O\nUltimate B-ORG\nFighting I-ORG\nChampionship I-ORG\nUFC B-ORG\ndon O\ndecide O\nsay O\ndem O\ngo O\nenta O\nback O\ndi O\noctagon O\non O\nSaturday B-DATE\n9 I-DATE\nMay I-DATE\nfor O\nJacksonville B-LOC\nO", "Mixed Martial Arts joinbodi Ultimate Fighting Championship UFC don decide say dem go enta back di octagon on Saturday 9 May for Jacksonville O", - new TypedSpanImpl(1, 62, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + new TypedSpanImpl(0, 18, "http://dbpedia.org/ontology/Organisation"), 0, 0, false }); // Swahili language testConfigs.add(new Object[] { - "Hii O\n ni O\n baada O\n ya O\n rais O\n Yoweri B-PER\n Museveni I-PER\n kuongeza O\n mda O\n wa O\n amri O\n karibu O\n 36 O\n alizotoa O\n katika O\n juhudi O\n za O\n kukabiliana O\n na O\n maambukizi O\n ya O\n Corona O\n nchini O\n humo O\n kwa O\n wiki B-DATE\n tatu I-DATE\n zaidi O\n kuanzia O\n 
leo O\n jumanne B-DATE\n . O\n", - "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizoto katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", + "Hii O\nni O\nbaada O\nya O\nrais O\nYoweri B-PER\nMuseveni I-PER\nkuongeza O\nmda O\nwa O\namri O\nkaribu O\n36 O\nalizotoa O\nkatika O\njuhudi O\nza O\nkukabiliana O\nna O\nmaambukizi O\nya O\nCorona O\nnchini O\nhumo O\nkwa O\nwiki B-DATE\ntatu I-DATE\nzaidi O\nkuanzia O\nleo O\njumanne B-DATE\n.", + "Hii ni baada ya rais Yoweri Museveni kuongeza mda wa amri karibu 36 alizotoa katika juhudi za kukabiliana na maambukizi ya Corona nchini humo kwa wiki tatu zaidi kuanzia leo jumanne.", new TypedSpanImpl(21, 15, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Wolof language testConfigs.add(new Object[] { - "Tënub O Léwopóol B-PER II I-PER ba O nekk O ca O déngaleereb O ngàngunaay O bu O Burusel B-LOC la O ñu O daax O cuub O bu O xonq O ci O tallata O jee O ci O ngoon O . O", + "Tënub O\nLéwopóol B-PER\nII I-PER\nba O\nnekk O\nca O\ndéngaleereb O\nngàngunaay O\nbu O\nBurusel B-LOC\nla O\nñu O\ndaax O\ncuub O\nbu O\nxonq O\nci O\ntallata O\njee O\nci O\nngoon O\n.", "Tënub Léwopóol II ba nekk ca déngaleereb ngàngunaay bu Burusel la ñu daax cuub bu xonq ci tallata jee ci ngoon.", - new TypedSpanImpl(1, 17, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(6, 11, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Yoruba language testConfigs.add(new Object[] { - "Ní O ibi O ìfẹ̀hónúhàn O ní O Luanda B-LOC , O àmì O náà O sọ O pé O “ O 30 O . O 500 O Kwanzas O kì O í O ṣe O kékeré O O . 
O ", - "Ní ibi ìfẹ̀hónúhàn ní Luanda, àmì náà sọ pé “30.500 Kwanzas kì í ṣe kékeré.", - new TypedSpanImpl(30, 6, "http://dbpedia.org/ontology/Place"), 0, 0, false }); + "A O\nrán O\nWa B-PER\nLone I-PER\nàti O\nKyaw B-PER\nSoe I-PER\nOo I-PER\nsí O\nẹ̀wọ̀n O\nọdún O\nméje O\nfún O\nrírú O\nòfin O\nÌkọ̀kọ̀ O\nsáà O\n- O\nakónilẹ́rú O\n.", + "A rán Wa Lone àti Kyaw Soe Oo sí ẹ̀wọ̀n ọdún méje fún rírú òfin Ìkọ̀kọ̀ sáà- akónilẹ́rú.", + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Bambara language testConfigs.add(new Object[] { - "Damakasisɛbɛn O ladonna O jumadon B-DATE mɛkalo I-DATE tile I-DATE 28 I-DATE , O Kati B-LOC kiritikɛso O la O . O", + "Damakasisɛbɛn O\nladonna O\njumadon B-DATE\nmɛkalo I-DATE\ntile I-DATE\n28 I-DATE\n, O\nKati B-LOC\nkiritikɛso O\nla O\n.", "Damakasisɛbɛn ladonna jumadon mɛkalo tile 28, Kati kiritikɛso la.", - new TypedSpanImpl(46, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(22, 22, "http://dbpedia.org/ontology/Unknown"), 0, 0, false }); // Ghomala language - testConfigs - .add(new Object[] { "Sɔ́ʼ O m O nə́ O cúʼtə O khəkhə O ntʉ́m O kɔŋsɛ̂ O Valserô B-PER Zenît B-ORG", - "Sɔ́ʼ m nə́ cúʼtə khəkhə ntʉ́m kɔŋsɛ̂ Valserô Zenît", - new TypedSpanImpl(37, 7, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + testConfigs.add(new Object[] { + "Brɛ́ndá B-PER\nBiya I-PER\nmú O\nyə O\nmjwǐ O\nFo O\ngúŋ O\nLəpʉə O\nKaməlûm B-LOC\n.", + "Brɛ́ndá Biya mú yə mjwǐ Fo gúŋ Ləpʉə Kaməlûm.", + new TypedSpanImpl(0, 13, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Ewe language testConfigs.add(new Object[] { - "Tsitretsitsi O ɖe O aʋawɔwɔ O ŋu O le O Burkina B-LOC Faso I-LOC : O dziɖuɖua O ɖe O gbeƒã O ame O aɖe O ƒe O lele O . O", - "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso : dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", + "Tsitretsitsi O\nɖe O\naʋawɔwɔ O\nŋu O\nle O\nBurkina B-LOC\nFaso I-LOC\n: O\ndziɖuɖua O\nɖe O\ngbeƒã O\name O\naɖe O\nƒe O\nlele O\n. 
O", + "Tsitretsitsi ɖe aʋawɔwɔ ŋu le Burkina Faso: dziɖuɖua ɖe gbeƒã ame aɖe ƒe lele.", new TypedSpanImpl(30, 12, "http://dbpedia.org/ontology/Place"), 0, 0, false }); // Fon language testConfigs.add(new Object[] { - "Atinkɛn O nɛ́ O è O è O bló O ɖò O Benɛ B-LOC ɔ́ O O è O gbɛ́ O ɖɔ O è O kún O ná O zán O é O lɔ́ɔ O mɔ̌ O ó O . O", + "Atinkɛn O\nɛ́ O\nè O\nè O\nbló O\nɖò O\nBenɛ B-LOC\nɔ́ O\nè O\ngbɛ́ O\nɖɔ O\nè O\nkún O\nná O\nzán O\né O\nlɔ́ɔ O\nmɔ̌ O\nó O\n. O", "Atinkɛn ɛ́ è è bló ɖò Benɛ ɔ́ è gbɛ́ ɖɔ è kún ná zán é lɔ́ɔ mɔ̌ ó.", new TypedSpanImpl(22, 4, "http://dbpedia.org/ontology/Place"), 0, 0, false }); // Mossi language testConfigs.add(new Object[] { - "Naam O yell O Genon B-LOC soogã O : O talgdbã O 39 O wã O be O bʋ O - O kaoodb O taoore O . O", - "Naam yell Genon soogã : talgdbã 39 wã be bʋ - kaoodb taoore.", + "Naam O\nyell O\nGenon B-LOC\nsoogã O\n: O\ntalgdbã O\n39 O\nwã O\nbe O\nbʋ O\n- O\nkaoodb O\ntaoore O\n. O", + "Naam yell Genon soogã: talgdbã 39 wã be bʋ- kaoodb taoore.", new TypedSpanImpl(10, 5, "http://dbpedia.org/ontology/Place"), 0, 0, false }); // Chichewa language testConfigs.add(new Object[] { - "Ukwati O ndiye O adamanga O pa O 4 B-DATE October I-DATE 2015 I-DATE , O ku O Feed B-ORG the I-ORG Children I-ORG ku O Nyambadwe B-LOC mumzindawu O . O", + "Ukwati O\nndiye O\nadamanga O\npa O\n4 B-DATE\nOctober I-DATE\n2015 I-DATE\n, O\nku O\nFeed B-ORG\nthe I-ORG\nChildren I-ORG\nku O\nNyambadwe B-LOC\nmumzindawu O\n. O", "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", - new TypedSpanImpl(44, 17, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + new TypedSpanImpl(25, 14, "http://dbpedia.org/ontology/Unknown"), 0, 0, false }); // Setswana language - testConfigs.add(new Object[] { "Zuma B-PER o O ipolela O a O se O molato O. 
O", "Zuma o ipolela a se molato.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + testConfigs.add(new Object[] { + "Zuma B-PER\no O\nipolela O\na O\nse O\nmolato O\n. O", + "Zuma o ipolela a se molato.", + new TypedSpanImpl(0, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Twi (Akan/Twi) language - testConfigs.add(new Object[] { "Paul B-PER resusu O sika O dodow O a O ohia O na O ɔde O awie O fie O no O . O", + testConfigs.add(new Object[] { + "Paul B-PER\nresusu O\nsika O\ndodow O\na O\nohia O\nna O\nɔde O\nawie O\nfie O\nno O\n. O", "Paul resusu sika dodow a ohia na ɔde awie fie no.", - new TypedSpanImpl(1, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(0, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // chiShona language - testConfigs.add(new Object[] { "Messi B-PER ndiye O akarova O penalty O yekutanga O akatadza O . O", + testConfigs.add(new Object[] { + "Messi B-PER\nndiye O\nakarova O\npenalty O\nyekutanga O\nakatadza O\n. O", "Messi ndiye akarova penalty yekutanga akatadza.", - new TypedSpanImpl(1, 5, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(0, 5, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // isiXhosa language testConfigs.add(new Object[] { - "Ngempazamo O nje O enye O, O iye O yohlwaywa O kabuhlungu O nayo O iRussia B-ORG izolo B-DATE. 
O", + "Ngempazamo O\nnje O\nenye O\n, O\niye O\nyohlwaywa O\nkabuhlungu O\nnayo O\niRussia B-ORG\nizolo B-DATE\n.", "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", - new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organization"), 0, 0, false }); + new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organisation"), 0, 0, false }); // isiZulu language - testConfigs.add(new Object[] { "IMeya O yeTheku B-LOC ingenelela O enkingeni O yombhikisho O", + testConfigs.add(new Object[] { + "IMeya O\nyeTheku B-LOC\ningenelela O\nenkingeni O\nyombhikisho O", "IMeya yeTheku ingenelela enkingeni yombhikisho", new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0, false }); From 01eb711dc178229b492593cf7d990a608d213cc0 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 26 Jul 2023 12:17:23 +0200 Subject: [PATCH 19/25] added correct punctuation for word seperator --- .../gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 6145edee8..1e030a407 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -33,8 +33,8 @@ public static Collection data() { List testConfigs = new ArrayList(); // Amharic language testConfigs.add(new Object[] { - "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n፡፡ O", - "የጀርመን : የምርጫ : ዘመቻን : አስመልክቶ : ከባልደረባችን : ማንተጋፍቶት : ስለሺ : ጋር : ቃለ : ምልልስ : አድርገናል ፡፡ ", + "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n፡፡", + "የጀርመን ፡ የምርጫ ፡ ዘመቻን ፡ አስመልክቶ ፡ ከባልደረባችን ፡ ማንተጋፍቶት ፡ ስለሺ ፡ ጋር ፡ ቃለ ፡ ምልልስ ፡ አድርገናል፡፡", new TypedSpanImpl(36, 12, 
"http://dbpedia.org/ontology/Person"), 0, 0, true }); // Hausa language testConfigs.add(new Object[] { From e3506bf6e662d44257130bf1370ec91e654c0c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Wed, 26 Jul 2023 15:50:06 +0200 Subject: [PATCH 20/25] Updated the Amharic test case. Introduced a second test case. --- .../impl/masakha/MasakhaNERDatasetTest.java | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java index 1e030a407..6d715dbf9 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDatasetTest.java @@ -33,9 +33,13 @@ public static Collection data() { List testConfigs = new ArrayList(); // Amharic language testConfigs.add(new Object[] { - "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n፡፡", - "የጀርመን ፡ የምርጫ ፡ ዘመቻን ፡ አስመልክቶ ፡ ከባልደረባችን ፡ ማንተጋፍቶት ፡ ስለሺ ፡ ጋር ፡ ቃለ ፡ ምልልስ ፡ አድርገናል፡፡", - new TypedSpanImpl(36, 12, "http://dbpedia.org/ontology/Person"), 0, 0, true }); + "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n።", + "የጀርመን ፡ የምርጫ ፡ ዘመቻን ፡ አስመልክቶ ፡ ከባልደረባችን ፡ ማንተጋፍቶት ፡ ስለሺ ፡ ጋር ፡ ቃለ ፡ ምልልስ ፡ አድርገናል።", + new TypedSpanImpl(0, 5, "http://dbpedia.org/ontology/Place"), 0, 0, true }); + testConfigs.add(new Object[] { + "የጀርመን B-LOC\nየምርጫ O\nዘመቻን O\nአስመልክቶ O\nከባልደረባችን O\nማንተጋፍቶት B-PER\nስለሺ I-PERO\nጋር O\nቃለ O\nምልልስ O\nአድርገናል O\n።", + "የጀርመን ፡ የምርጫ ፡ ዘመቻን ፡ አስመልክቶ ፡ ከባልደረባችን ፡ ማንተጋፍቶት ፡ ስለሺ ፡ ጋር ፡ ቃለ ፡ ምልልስ ፡ አድርገናል።", + new TypedSpanImpl(42, 13, "http://dbpedia.org/ontology/Person"), 0, 1, true }); // Hausa language testConfigs.add(new Object[] { "A O\nsaurari O\ncikakken O\nrahoton O\nwakilin O\nMuryar B-ORG\nAmurka I-ORG\nIbrahim 
B-PER\nAbdul'aziz I-PER", @@ -80,7 +84,7 @@ public static Collection data() { testConfigs.add(new Object[] { "A O\nrán O\nWa B-PER\nLone I-PER\nàti O\nKyaw B-PER\nSoe I-PER\nOo I-PER\nsí O\nẹ̀wọ̀n O\nọdún O\nméje O\nfún O\nrírú O\nòfin O\nÌkọ̀kọ̀ O\nsáà O\n- O\nakónilẹ́rú O\n.", "A rán Wa Lone àti Kyaw Soe Oo sí ẹ̀wọ̀n ọdún méje fún rírú òfin Ìkọ̀kọ̀ sáà- akónilẹ́rú.", - new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Bambara language testConfigs.add(new Object[] { "Damakasisɛbɛn O\nladonna O\njumadon B-DATE\nmɛkalo I-DATE\ntile I-DATE\n28 I-DATE\n, O\nKati B-LOC\nkiritikɛso O\nla O\n.", @@ -112,18 +116,16 @@ public static Collection data() { "Ukwati ndiye adamanga pa 4 October 2015, ku Feed the Children ku Nyambadwe mumzindawu.", new TypedSpanImpl(25, 14, "http://dbpedia.org/ontology/Unknown"), 0, 0, false }); // Setswana language - testConfigs.add(new Object[] { - "Zuma B-PER\no O\nipolela O\na O\nse O\nmolato O\n. O", - "Zuma o ipolela a se molato.", - new TypedSpanImpl(0, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); + testConfigs.add( + new Object[] { "Zuma B-PER\no O\nipolela O\na O\nse O\nmolato O\n. O", "Zuma o ipolela a se molato.", + new TypedSpanImpl(0, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // Twi (Akan/Twi) language - testConfigs.add(new Object[] { + testConfigs.add(new Object[] { "Paul B-PER\nresusu O\nsika O\ndodow O\na O\nohia O\nna O\nɔde O\nawie O\nfie O\nno O\n. O", "Paul resusu sika dodow a ohia na ɔde awie fie no.", new TypedSpanImpl(0, 4, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // chiShona language - testConfigs.add(new Object[] { - "Messi B-PER\nndiye O\nakarova O\npenalty O\nyekutanga O\nakatadza O\n. O", + testConfigs.add(new Object[] { "Messi B-PER\nndiye O\nakarova O\npenalty O\nyekutanga O\nakatadza O\n. 
O", "Messi ndiye akarova penalty yekutanga akatadza.", new TypedSpanImpl(0, 5, "http://dbpedia.org/ontology/Person"), 0, 0, false }); // isiXhosa language @@ -132,8 +134,7 @@ public static Collection data() { "Ngempazamo nje enye, iye yohlwaywa kabuhlungu nayo iRussia izolo.", new TypedSpanImpl(51, 7, "http://dbpedia.org/ontology/Organisation"), 0, 0, false }); // isiZulu language - testConfigs.add(new Object[] { - "IMeya O\nyeTheku B-LOC\ningenelela O\nenkingeni O\nyombhikisho O", + testConfigs.add(new Object[] { "IMeya O\nyeTheku B-LOC\ningenelela O\nenkingeni O\nyombhikisho O", "IMeya yeTheku ingenelela enkingeni yombhikisho", new TypedSpanImpl(6, 7, "http://dbpedia.org/ontology/Place"), 0, 0, false }); From 2949ab7868ce282ff731a887bfcaef677f5bb352 Mon Sep 17 00:00:00 2001 From: neha2022 Date: Wed, 26 Jul 2023 19:07:10 +0200 Subject: [PATCH 21/25] fixes test cases --- .../impl/conll/GenericCoNLLDataset.java | 3 ++- .../impl/derczynski/DerczynskiDatasetTest.java | 8 ++++---- .../dataset/impl/ritter/RitterDatasetTest.java | 18 +++++++++--------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 61be21e22..6fb126f80 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -102,7 +102,7 @@ public class GenericCoNLLDataset extends AbstractDataset implements Initializabl /** * String inserted between tokens if no white space should be inserted. 
*/ - protected String nonWhitespace = ""; + protected String nonWhitespace = ""; public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); @@ -234,6 +234,7 @@ protected List processSingleDocument(List linesOfCurrentDoc, St case '.': case '#': case '-': + case '=': case '፠': // ፠ section mark // falls through case '።': // ። full stop (period) case '፣': // ፣ comma diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java index 1ab22310e..d012374fe 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/derczynski/DerczynskiDatasetTest.java @@ -51,19 +51,19 @@ public static Collection data() { List testConfigs = new ArrayList(); testConfigs.add(new Object[] { "#Astros http://dbpedia.org/resource/Houston_Astros B-sportsteam HT\nlineup O NN\nfor O IN\ntonight O NN\n. O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", - "#Astros lineup for tonight . Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ", + "#Astros lineup for tonight. Keppinger sits, Downs plays 2B, CJ bats 5th. @alysonfooter http://bit.ly/bHvgCS", new TypedNamedEntity(0, 7, "http://dbpedia.org/resource/Houston_Astros", new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/SportsTeam"))), 0, 0 }); testConfigs.add(new Object[] { "#Astros http://dbpedia.org/resource/Houston_Astros B-sportsteam HT\nlineup O NN\nfor O IN\ntonight O NN\n. 
O 0\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", - "#Astros lineup for tonight . Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ", - new TypedNamedEntity(29, 9, "http://dbpedia.org/resource/Jeff_Keppinger", + "#Astros lineup for tonight. Keppinger sits, Downs plays 2B, CJ bats 5th. @alysonfooter http://bit.ly/bHvgCS", + new TypedNamedEntity(28, 9, "http://dbpedia.org/resource/Jeff_Keppinger", new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Person"))), 0, 1 }); testConfigs.add(new Object[] { "#Astros O B-sportsteam HT\nlineup O I-sportsteam NN\nfor O IN\ntonight O NN\n. O 0\nJeff http://dbpedia.org/resource/Jeff_Keppinger B-person NNP\nKeppinger http://dbpedia.org/resource/Jeff_Keppinger I-person NNP\nsits O VBZ\n, O ,\nDowns http://dbpedia.org/resource/Brodie_Downs B-person NNP\nplays O VBZ\n2B O NN\n, O ,\nCJ NIL B-person NNP\nbats O VBZ\n5th O JJ\n. O 0\n@alysonfooter O USR\nhttp://bit.ly/bHvgCS O URL", - "#Astros lineup for tonight . Jeff Keppinger sits , Downs plays 2B , CJ bats 5th . @alysonfooter http://bit.ly/bHvgCS ", + "#Astros lineup for tonight. Jeff Keppinger sits, Downs plays 2B, CJ bats 5th. 
@alysonfooter http://bit.ly/bHvgCS", new TypedSpanImpl(0, 14, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/SportsTeam"))), 0, 0 }); return testConfigs; diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java index 02178c122..28d830d04 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java @@ -47,17 +47,17 @@ public InitializableDataset createDataset(File file) { public static Collection data() { List testConfigs = new ArrayList(); testConfigs.add(new Object[] { - "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening . ", - new TypedSpanImpl(64, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); + "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. 
Pretty bad storm here last evening.", + new TypedSpanImpl(61, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); testConfigs.add(new Object[] { - "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening . ", - new TypedSpanImpl(88, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); + "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", + new TypedSpanImpl(84, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); testConfigs.add(new Object[] { - "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 
5 http://tinyurl.com/24agj38 ", - new TypedSpanImpl(22, 4, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Unknown"))), 1, 0 }); + "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "From Green Newsfeed: AHFA extends deadline for Sage Award to Nov. 5 http://tinyurl.com/24agj38", + new TypedSpanImpl(21, 4, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Unknown"))), 1, 0 }); return testConfigs; } From 6d11c0b6c2edcbef63df924dfecccb63ca6bdd03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Thu, 25 Jan 2024 17:31:41 +0100 Subject: [PATCH 22/25] Rolled back changes in the Ritter dataset test to ensure that missing features are still part of the text. Added a comment to the test to point to the missing feature. --- .../dataset/impl/ritter/RitterDatasetTest.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java index 28d830d04..45f156590 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java @@ -46,16 +46,18 @@ public InitializableDataset createDataset(File file) { @Parameters public static Collection data() { List testConfigs = new ArrayList(); + // TODO: The problem of leading white spaces in front of apostrophes needs a + // solution (e.g. "I 'm") testConfigs.add(new Object[] { - "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. 
O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", - new TypedSpanImpl(61, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); + "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", + new TypedSpanImpl(63, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); testConfigs.add(new Object[] { - "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", - new TypedSpanImpl(84, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); + "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. 
O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", + new TypedSpanImpl(86, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); testConfigs.add(new Object[] { - "@paulwalk O\nIt's O\nthe O\nview O\nfrom O\nwhere O\nI'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", + "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", "From Green Newsfeed: AHFA extends deadline for Sage Award to Nov. 5 http://tinyurl.com/24agj38", new TypedSpanImpl(21, 4, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Unknown"))), 1, 0 }); return testConfigs; From f8200e3c21a33ef924bdce969b03492a7dcf0956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Thu, 25 Jan 2024 17:32:04 +0100 Subject: [PATCH 23/25] Added comments to classes. 
--- .../impl/conll/GenericCoNLLDataset.java | 16 +++++++++-- .../impl/masakha/MasakhaNERDataset.java | 27 ++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index 6fb126f80..b4e5eb38a 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -102,7 +102,7 @@ public class GenericCoNLLDataset extends AbstractDataset implements Initializabl /** * String inserted between tokens if no white space should be inserted. */ - protected String nonWhitespace = ""; + protected String nonWhitespace = ""; public GenericCoNLLDataset(String file, int annotationColumn, int uriColumn, CoNLLTypeRetriever typeRetriever) { this(file, annotationColumn, uriColumn, typeRetriever, -1, -1); @@ -336,7 +336,19 @@ protected List processSingleDocument(List linesOfCurrentDoc, St return markings; } - private Span createNewMarking(String[] line, int startPos) { + /** + * Generates a {@link Marking} that is at least an implementation of the + * {@link Span} interface or even more, depending on the data available in the + * given line. Note that the {@link Span} instances created by this method have the + * length 0. Their final length is set outside of this method. + * + * @param line the current line of the CoNLL file + * @param startPos the start position of the {@link Span}, i.e., the position of + * this line within the text + * @return A {@link Span} instance which already contains nearly all information + * about the {@link Marking}, except its length + */ + protected Span createNewMarking(String[] line, int startPos) { // get type of the marking TODO if the B- and I- are configurable, the // substring(2) has to be configurable as well.
String type = typeRetriever.getTypeURI(line[annotationColumn].substring(2)); diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java index aa4c919d1..8d5763aa2 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/masakha/MasakhaNERDataset.java @@ -3,19 +3,26 @@ import org.aksw.gerbil.dataset.impl.conll.GenericCoNLLDataset; import org.aksw.gerbil.dataset.impl.conll.CoNLLTypeRetriever; +/** + * An extension of the {@link GenericCoNLLDataset} class that can handle + * datasets of the MasakhaNER dataset collection. + * + * @author Neha Pokharel + * @author Michael Röder (michael.roeder@uni-paderborn.de) + * + */ public class MasakhaNERDataset extends GenericCoNLLDataset { - - private static final int ANNOTATION_COLUMN = 1; + + private static final int ANNOTATION_COLUMN = 1; private static final int URI_COLUMN = -1; - private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, - "DATE", "PER", null, null, null, "ORG"); + private static final CoNLLTypeRetriever TYPE_TAGS = new CoNLLTypeRetriever("LOC", null, null, null, "DATE", "PER", + null, null, null, "ORG"); public MasakhaNERDataset(String file, boolean isAmharic) { - super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); - setColumnSeparator(" "); - if(isAmharic) { - setWhitespace(" ፡ "); - } + super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS); + setColumnSeparator(" "); + if (isAmharic) { + setWhitespace(" ፡ "); + } } } - From 9e8c9799cb48e8512480060087e09d46920fd56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Thu, 25 Jan 2024 18:16:24 +0100 Subject: [PATCH 24/25] Updated Masakha dataset definitions. 
--- src/main/properties/datasets.properties | 530 +++++++++++++++--------- 1 file changed, 329 insertions(+), 201 deletions(-) diff --git a/src/main/properties/datasets.properties b/src/main/properties/datasets.properties index 76f7a3b47..dda43e499 100644 --- a/src/main/properties/datasets.properties +++ b/src/main/properties/datasets.properties @@ -313,347 +313,475 @@ org.aksw.gerbil.datasets.definition.KORE50.cacheable=true org.aksw.gerbil.datasets.definition.KORE50.experimentType=A2KB org.aksw.gerbil.datasets.definition.KORE50.constructorArgs=${org.aksw.gerbil.datasets.KORE50.file},${org.aksw.gerbil.datasets.definition.KORE50.name} - - ### Masakha org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir=${org.aksw.gerbil.DataPath}/datasets/masakha ### Amharic -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.name=MasakhaNER-Amharic-Dev +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.name=MasakhaNER Amharic Dev org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Amharic-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.name=MasakhaNER-Amharic-Test +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.name=MasakhaNER Amharic Test org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/test.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.cacheable=true
org.aksw.gerbil.datasets.definition.Masakha-Amharic-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.name=MasakhaNER-Amharic-Train +org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.name=MasakhaNER Amharic Train org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/amh/train.txt org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Amharic-Train.experimentType=RT2KB ### Hausa -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.name=MasakhaNER-Hausa-Dev +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.name=MasakhaNER Hausa Dev org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Hausa-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.name=MasakhaNER-Hausa-Test +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.name=MasakhaNER Hausa Test org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/test.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Hausa-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.name=MasakhaNER-Hausa-Train +org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.name=MasakhaNER 
Hausa Train org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/hau/train.txt org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Hausa-Train.experimentType=RT2KB ### Igbo -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.name=MasakhaNER-Igbo-Dev +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.name=MasakhaNER Igbo Dev org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Igbo-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.name=MasakhaNER-Igbo-Test +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.name=MasakhaNER Igbo Test org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Igbo-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.name=MasakhaNER-Igbo-Train +org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.name=MasakhaNER Igbo Train org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ibo/train.txt org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Igbo-Train.experimentType=RT2KB ### Kinyarwanda -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.name=MasakhaNER-Kinyarwanda-Dev +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.name=MasakhaNER Kinyarwanda Dev org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.name=MasakhaNER-Kinyarwanda-Test +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.name=MasakhaNER Kinyarwanda Test org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/test.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.name=MasakhaNER-Kinyarwanda-Train +org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.name=MasakhaNER Kinyarwanda Train org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/kin/train.txt org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Kinyarwanda-Train.experimentType=RT2KB ### Luganda -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.name=MasakhaNER-Luganda-Dev +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.name=MasakhaNER Luganda Dev org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luganda-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.name=MasakhaNER-Luganda-Test +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.name=MasakhaNER Luganda Test org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luganda-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.name=MasakhaNER-Luganda-Train +org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.name=MasakhaNER Luganda Train org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/lug/train.txt 
org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luganda-Train.experimentType=RT2KB ### Luo -org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.name=MasakhaNER-Luo-Dev +org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.name=MasakhaNER Luo Dev org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luo-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.name=MasakhaNER-Luo-Test +org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.name=MasakhaNER Luo Test org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/test.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luo-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.name=MasakhaNER-Luo-Train +org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.name=MasakhaNER Luo Train org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/luo/train.txt org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Luo-Train.experimentType=RT2KB ### Naija -org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.name=MasakhaNER-Naija-Dev 
+org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.name=MasakhaNER Nigerian Pidgin Dev org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Naija-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.name=MasakhaNER-Naija-Test +org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.name=MasakhaNER Nigerian Pidgin Test org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/test.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Naija-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.name=MasakhaNER-Naija-Train +org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.name=MasakhaNER Nigerian Pidgin Train org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/pcm/train.txt org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Naija-Train.experimentType=RT2KB ### Swahili -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.name=MasakhaNER-Swahili-Dev +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.name=MasakhaNER Swahili Dev org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Swahili-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.name=MasakhaNER-Swahili-Test +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.name=MasakhaNER Swahili Test org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/test.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Swahili-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.name=MasakhaNER-Swahili-Train +org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.name=MasakhaNER Swahili Train org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/swa/train.txt org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Swahili-Train.experimentType=RT2KB ### Wolof -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.name=MasakhaNER-Wolof-Dev +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.name=MasakhaNER Wolof Dev org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/dev.txt 
org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Wolof-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.name=MasakhaNER-Wolof-Test +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.name=MasakhaNER Wolof Test org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/test.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Wolof-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.name=MasakhaNER-Wolof-Train +org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.name=MasakhaNER Wolof Train org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/wol/train.txt org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Wolof-Train.experimentType=RT2KB ### Yoruba -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.name=MasakhaNER-Yoruba-Dev +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.name=MasakhaNER Yoruba Dev org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/dev.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.name=MasakhaNER-Yoruba-Test 
+org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.name=MasakhaNER Yoruba Test org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/test.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER-Yoruba-Train +org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.name=MasakhaNER Yoruba Train org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/yor/train.txt org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.cacheable=true org.aksw.gerbil.datasets.definition.Masakha-Yoruba-Train.experimentType=RT2KB + +### Masakha 2 +org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir=${org.aksw.gerbil.DataPath}/datasets/masakha2 ### Bambara language -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.name=MasakhaNER-Bambara-Dev -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.name=MasakhaNER-Bambara-Test -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
-org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.name=MasakhaNER-Bambara-Train -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bam/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Bambara-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Dev.name=MasakhaNER 2.0 Bambara Dev +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bam/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Test.name=MasakhaNER 2.0 Bambara Test +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bam/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Train.name=MasakhaNER 2.0 Bambara Train 
+org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bam/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Bambara-Train.experimentType=RT2KB +### Ghomala language +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Dev.name=MasakhaNER 2.0 Ghomala Dev +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bbj/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Test.name=MasakhaNER 2.0 Ghomala Test +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bbj/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Train.name=MasakhaNER 2.0 Ghomala Train +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/bbj/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Train.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha2-Ghomala-Train.experimentType=RT2KB ### Ewe language -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.name=MasakhaNER-Ewe-Dev -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.name=MasakhaNER-Ewe-Test -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.name=MasakhaNER-Ewe-Train -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/ewe/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ewe-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Dev.name=MasakhaNER 2.0 Ewe Dev +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ewe/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Dev.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Test.name=MasakhaNER 2.0 Ewe Test +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ewe/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Train.name=MasakhaNER 2.0 Ewe Train +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ewe/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Ewe-Train.experimentType=RT2KB ### Fon language -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.name=MasakhaNER-Fon-Dev -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.name=MasakhaNER-Fon-Test -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Fon-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.name=MasakhaNER-Fon-Train -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/fon/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Fon-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Dev.name=MasakhaNER 2.0 Fon Dev +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/fon/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Test.name=MasakhaNER 2.0 Fon Test +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/fon/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Train.name=MasakhaNER 2.0 Fon Train +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/fon/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Fon-Train.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha2-Fon-Train.experimentType=RT2KB +### Hausa +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Dev.name=MasakhaNER 2.0 Hausa Dev +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/hau/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Test.name=MasakhaNER 2.0 Hausa Test +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/hau/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Train.name=MasakhaNER 2.0 Hausa Train +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/hau/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Hausa-Train.experimentType=RT2KB +### Igbo +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Dev.name=MasakhaNER 2.0 Igbo Dev +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ibo/dev.txt 
+org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Test.name=MasakhaNER 2.0 Igbo Test +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ibo/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Train.name=MasakhaNER 2.0 Igbo Train +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/ibo/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Igbo-Train.experimentType=RT2KB +### Kinyarwanda +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Dev.name=MasakhaNER 2.0 Kinyarwanda Dev +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/kin/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Test.name=MasakhaNER 2.0 Kinyarwanda Test +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/kin/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Train.name=MasakhaNER 2.0 Kinyarwanda Train +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/kin/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Kinyarwanda-Train.experimentType=RT2KB +### Luganda +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Dev.name=MasakhaNER 2.0 Luganda Dev +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/lug/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Test.name=MasakhaNER 2.0 Luganda Test +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/lug/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Test.experimentType=RT2KB 
+org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Train.name=MasakhaNER 2.0 Luganda Train +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/lug/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Luganda-Train.experimentType=RT2KB +### Luo +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Dev.name=MasakhaNER 2.0 Luo Dev +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/luo/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Test.name=MasakhaNER 2.0 Luo Test +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/luo/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Train.name=MasakhaNER 2.0 Luo Train +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/luo/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Luo-Train.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha2-Luo-Train.experimentType=RT2KB ### Mossi language -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.name=MasakhaNER-Mossi-Dev -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.name=MasakhaNER-Mossi-Test -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.name=MasakhaNER-Mossi-Train -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/mos/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Mossi-Train.experimentType=RT2KB -### Ghomala language -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.name=MasakhaNER-Ghomala-Dev -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/dev.txt 
-org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.name=MasakhaNER-Ghomala-Test -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.name=MasakhaNER-Ghomala-Train -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/bbj/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Ghomala-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Dev.name=MasakhaNER 2.0 Mossi Dev +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/mos/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Test.name=MasakhaNER 2.0 Mossi Test +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/mos/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Train.name=MasakhaNER 2.0 Mossi Train +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/mos/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Mossi-Train.experimentType=RT2KB +### Naija +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Dev.name=MasakhaNER 2.0 Nigerian Pidgin Dev +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/pcm/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Test.name=MasakhaNER 2.0 Nigerian Pidgin Test +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/pcm/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Train.name=MasakhaNER 2.0 Nigerian Pidgin Train 
+org.aksw.gerbil.datasets.definition.Masakha2-Naija-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/pcm/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Naija-Train.experimentType=RT2KB ### Chichewa language -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.name=MasakhaNER-Chichewa-Dev -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.name=MasakhaNER-Chichewa-Test -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.name=MasakhaNER-Chichewa-Train -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/nya/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.cacheable=true 
-org.aksw.gerbil.datasets.definition.Masakha-Chichewa-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Dev.name=MasakhaNER 2.0 Chichewa Dev +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/nya/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Test.name=MasakhaNER 2.0 Chichewa Test +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/nya/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Train.name=MasakhaNER 2.0 Chichewa Train +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/nya/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Chichewa-Train.experimentType=RT2KB +### chiShona language +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Dev.name=MasakhaNER 2.0 chiShona Dev +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/sna/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Test.name=MasakhaNER 2.0 chiShona Test +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/sna/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Train.name=MasakhaNER 2.0 chiShona Train +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/sna/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-chiShona-Train.experimentType=RT2KB ### Setswana language -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.name=MasakhaNER-Setswana-Dev -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.name=MasakhaNER-Setswana-Test 
-org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.name=MasakhaNER-Setswana-Train -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/tsn/train.txt -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Setswana-Train.experimentType=RT2KB - +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Dev.name=MasakhaNER 2.0 Setswana Dev +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/tsn/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Test.name=MasakhaNER 2.0 Setswana Test +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/tsn/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Test.cacheable=true 
+org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Train.name=MasakhaNER 2.0 Setswana Train +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/tsn/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Setswana-Train.experimentType=RT2KB ### (Akan/Twi) language -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.name=MasakhaNER-Twi-Dev -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.name=MasakhaNER-Twi-Test -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/test.txt -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.name=MasakhaNER-Twi-Train -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/twi/train.txt 
-org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-Twi-Train.experimentType=RT2KB -### chiShona language -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.name=MasakhaNER-chiShona-Dev -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.name=MasakhaNER-chiShona-Test -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/test.txt -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.name=MasakhaNER-chiShona-Train -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/sna/train.txt -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-chiShona-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Dev.name=MasakhaNER 2.0 Akan/Twi Dev +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
+org.aksw.gerbil.datasets.definition.Masakha2-Twi-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/twi/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Test.name=MasakhaNER 2.0 Akan/Twi Test +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/twi/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Train.name=MasakhaNER 2.0 Akan/Twi Train +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/twi/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Twi-Train.experimentType=RT2KB +### Wolof +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Dev.name=MasakhaNER 2.0 Wolof Dev +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/wol/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Test.name=MasakhaNER 2.0 Wolof Test 
+org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/wol/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Train.name=MasakhaNER 2.0 Wolof Train +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/wol/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Wolof-Train.experimentType=RT2KB ### isiXhosa language -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.name=MasakhaNER-isiXhosa-Dev -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.name=MasakhaNER-isiXhosa-Test -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/test.txt -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Test.experimentType=RT2KB 
-org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.name=MasakhaNER-isiXhosa-Train -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/xho/train.txt -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiXhosa-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Dev.name=MasakhaNER 2.0 isiXhosa Dev +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/xho/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Test.name=MasakhaNER 2.0 isiXhosa Test +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/xho/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Train.name=MasakhaNER 2.0 isiXhosa Train +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/xho/train.txt 
+org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiXhosa-Train.experimentType=RT2KB +### Yoruba +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Dev.name=MasakhaNER 2.0 Yoruba Dev +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/yor/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Test.name=MasakhaNER 2.0 Yoruba Test +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/yor/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Train.name=MasakhaNER 2.0 Yoruba Train +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/yor/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-Yoruba-Train.experimentType=RT2KB ### isiZulu language -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.name=MasakhaNER-isiZulu-Dev -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset 
-org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/dev.txt -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Dev.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.name=MasakhaNER-isiZulu-Test -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/test.txt -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Test.experimentType=RT2KB -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.name=MasakhaNER-isiZulu-Train -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.constructorArgs=${org.aksw.gerbil.datasets.masakhaDatasetConfiguration.datasetdir}/zul/train.txt -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.cacheable=true -org.aksw.gerbil.datasets.definition.Masakha-isiZulu-Train.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Dev.name=MasakhaNER 2.0 isiZulu Dev +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Dev.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Dev.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/zul/dev.txt +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Dev.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Dev.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Test.name=MasakhaNER 2.0 isiZulu Test 
+org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Test.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Test.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/zul/test.txt +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Test.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Test.experimentType=RT2KB +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Train.name=MasakhaNER 2.0 isiZulu Train +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Train.class=org.aksw.gerbil.dataset.impl.masakha.MasakhaNERDataset +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Train.constructorArgs=${org.aksw.gerbil.datasets.masakha2DatasetConfiguration.datasetdir}/zul/train.txt +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Train.cacheable=true +org.aksw.gerbil.datasets.definition.Masakha2-isiZulu-Train.experimentType=RT2KB ### Meij #org.aksw.gerbil.datasets.MeijDatasetConfig.tweetsFile=${org.aksw.gerbil.DataPath}/datasets/meij/original_tweets.list From 55e481e7dfcd2f20242d1363c497e4ece282b718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20R=C3=B6der?= Date: Thu, 25 Jan 2024 18:52:49 +0100 Subject: [PATCH 25/25] Added a work around for the problem with apostrophes in the CoNLL format. 
--- .../impl/conll/GenericCoNLLDataset.java | 218 +++++++++--------- .../impl/ritter/RitterDatasetTest.java | 8 +- 2 files changed, 115 insertions(+), 111 deletions(-) diff --git a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java index b4e5eb38a..6cd2f8848 100644 --- a/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java +++ b/src/main/java/org/aksw/gerbil/dataset/impl/conll/GenericCoNLLDataset.java @@ -219,115 +219,119 @@ protected List processSingleDocument(List linesOfCurrentDoc, St whiteSpaceBehindPreviousToken = whiteSpaceBehindCurrentToken; // split the columns String[] token = tokenFull.split(columnSeparator); - // Check if the line has only one character - if (token[0].length() == 1) { - char ch = token[0].charAt(0); // Get the character - switch (ch) { - case '?': // falls through - case '!': - case ',': - case ')': - case ']': - case '}': - case ':': - case ';': - case '.': - case '#': - case '-': - case '=': - case '፠': // ፠ section mark // falls through - case '።': // ። full stop (period) - case '፣': // ፣ comma - case '፤': // ፤ semicolon - case '፥': // ፥ colon - case '፦': // ፦ preface colon - case '፧': // ፧ question mark - case '፨': // ፨ paragraph separator - { - whiteSpaceInFront = false; - whiteSpaceBehindCurrentToken = true; - break; - } - // General quotation characters (can be start or end) - // According to https://www.overleaf.com/learn/latex/Typesetting_quotations - case '"': // falls through - case '»': // Start in Danish, end in French, Russian, etc. 
- case '«': // Start in French, Russian, etc.; end in Danish - case '“': // Start in English, end in German, Lithuanian, Polish - { - // Toggle whiteSpaceBehind if the character is a quote mark - whiteSpaceInFront = !sawQuoteBefore; - whiteSpaceBehindCurrentToken = sawQuoteBefore; - sawQuoteBefore = !sawQuoteBefore; - break; - } - // English, UK ‘…’ - // Start quotation characters - case '„': // Start in German, Lithuanian, Polish - case '‚': // Start in English - { - whiteSpaceInFront = true; - whiteSpaceBehindCurrentToken = false; - sawQuoteBefore = true; - } - // End quotation characters - case '”': // End in a lot of languages - case '‘': // End in English - { - whiteSpaceInFront = false; - whiteSpaceBehindCurrentToken = true; - sawQuoteBefore = false; - } - case '(': // falls through - case '[': - case '{': { - whiteSpaceInFront = true; - whiteSpaceBehindCurrentToken = false; - break; - } - case '፡': // ፡ word separator - default: { - whiteSpaceInFront = true; - whiteSpaceBehindCurrentToken = true; - break; - } - } -// } else if (!Character.isLetterOrDigit(token[0].charAt(0))) { -// // Check if the first character of the line is not a letter or digit -// whiteSpaceInFront = false; -// // Set whiteSpaceInFront to false if the line starts with a non-letter or -// // non-digit character - } else { - whiteSpaceInFront = true; - whiteSpaceBehindCurrentToken = true; - } - // Remove leading/trailing whitespaces and normalize spaces within the token - String normalizedToken = token[0].trim().replaceAll("\\s+", " "); - - // If the current marking is not null AND there is no annotation column or there - // is no MARKING_INSIDE annotation --> The previous marking ended - if (currentMarking != null - && (token.length <= annotationColumn || !token[annotationColumn].startsWith(MARKING_INSIDE))) { - currentMarking.setLength(currentText.length() - currentMarking.getStartPosition()); - currentMarking = null; - } + // Ensure that there are tokens in the line and that the first 
token is not + // empty + if ((token.length > 0) && (token[0].length() > 0)) { + // Remove leading/trailing whitespaces and normalize spaces within the token + String normalizedToken = token[0].trim().replaceAll("\\s+", " "); + // Check again that the token is not empty + if (!normalizedToken.isEmpty()) { + // Check if the line has only one character + if (normalizedToken.length() == 1) { + char ch = normalizedToken.charAt(0); // Get the character + switch (ch) { + case '?': // falls through + case '!': + case ',': + case ')': + case ']': + case '}': + case ':': + case ';': + case '.': + case '#': + case '-': + case '=': + case '፠': // ፠ section mark + case '።': // ። full stop (period) + case '፣': // ፣ comma + case '፤': // ፤ semicolon + case '፥': // ፥ colon + case '፦': // ፦ preface colon + case '፧': // ፧ question mark + case '፨': // ፨ paragraph separator + { + whiteSpaceInFront = false; + whiteSpaceBehindCurrentToken = true; + break; + } + // General quotation characters (can be start or end) + // According to https://www.overleaf.com/learn/latex/Typesetting_quotations + case '"': // falls through + case '»': // Start in Danish, end in French, Russian, etc. 
+ case '«': // Start in French, Russian, etc.; end in Danish + case '“': // Start in English, end in German, Lithuanian, Polish + { + // Toggle whiteSpaceBehind if the character is a quote mark + whiteSpaceInFront = !sawQuoteBefore; + whiteSpaceBehindCurrentToken = sawQuoteBefore; + sawQuoteBefore = !sawQuoteBefore; + break; + } + // English, UK ‘…’ + // Start quotation characters + case '„': // Start in German, Lithuanian, Polish + case '‚': // Start in English + { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = false; + sawQuoteBefore = true; + } + // End quotation characters + case '”': // End in a lot of languages + case '‘': // End in English + { + whiteSpaceInFront = false; + whiteSpaceBehindCurrentToken = true; + sawQuoteBefore = false; + } + case '(': // falls through + case '[': + case '{': { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = false; + break; + } + case '፡': // ፡ word separator + default: { + whiteSpaceInFront = true; + whiteSpaceBehindCurrentToken = true; + break; + } + } + } else { + // Work around for situations with "I" followed by "'m" or "'ve": Only add a + // white space, if the token does not start with a ' + whiteSpaceInFront = !normalizedToken.startsWith("'"); + whiteSpaceBehindCurrentToken = true; + } - // Add the token from this line to the document's text - if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) { - currentText.append(whitespace); - } else { - currentText.append(nonWhitespace); - } - // If this line contains the start of a marking, we should keep track of it - if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { - // Create new marking - currentMarking = createNewMarking(token, currentText.length()); - markings.add(currentMarking); + // If the current marking is not null AND there is no annotation column or there + // is no MARKING_INSIDE annotation --> The previous marking ended + if (currentMarking != null && (token.length <= annotationColumn + || 
!token[annotationColumn].startsWith(MARKING_INSIDE))) { + currentMarking.setLength(currentText.length() - currentMarking.getStartPosition()); + currentMarking = null; + } + + // Add the token from this line to the document's text + if (whiteSpaceInFront && whiteSpaceBehindPreviousToken) { + currentText.append(whitespace); + } else { + currentText.append(nonWhitespace); + } + // If this line contains the start of a marking, we should keep track of it + if (token.length > annotationColumn && token[annotationColumn].startsWith(MARKING_START)) { + // Create new marking + currentMarking = createNewMarking(token, currentText.length()); + markings.add(currentMarking); + } + // TODO 1. make the whitespace configurable to allow other word separators 2. + // Remove the previous word separator if we have a punctuation character. + // (quotation, apostrophe) + currentText.append(normalizedToken); + } } - // TODO 1. make the whitespace configurable to allow other word separators 2. - // Remove the previous word separator if we have a punctuation character. - // (quotation, apostrophe) - currentText.append(normalizedToken); } // If there is an unfinished marking, finalize it if (currentMarking != null) { diff --git a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java index 45f156590..910dadaa7 100644 --- a/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java +++ b/src/test/java/org/aksw/gerbil/dataset/impl/ritter/RitterDatasetTest.java @@ -50,12 +50,12 @@ public static Collection data() { // solution (e.g. "I 'm") testConfigs.add(new Object[] { "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. 
O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", - new TypedSpanImpl(63, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); + "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", + new TypedSpanImpl(61, 21, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 0 }); testConfigs.add(new Object[] { "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. O\n5 O\nhttp://tinyurl.com/24agj38 O", - "@paulwalk It 's the view from where I 'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", - new TypedSpanImpl(86, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); + "@paulwalk It's the view from where I'm living for two weeks. Empire State Building= ESB. Pretty bad storm here last evening.", + new TypedSpanImpl(84, 3, new HashSet<>(Arrays.asList("http://dbpedia.org/ontology/Place"))), 0, 1 }); testConfigs.add(new Object[] { "@paulwalk O\nIt O\n's O\nthe O\nview O\nfrom O\nwhere O\nI O\n'm O\nliving O\nfor O\ntwo O\nweeks O\n. O\nEmpire B-facility\nState I-facility\nBuilding I-facility\n= O\nESB B-facility\n. O\nPretty O\nbad O\nstorm O\nhere O\nlast O\nevening O\n. O\n \nFrom O\nGreen O\nNewsfeed O\n: O\nAHFA B-other\nextends O\ndeadline O\nfor O\nSage B-other\nAward I-other\nto O\nNov O\n. 
O\n5 O\nhttp://tinyurl.com/24agj38 O", "From Green Newsfeed: AHFA extends deadline for Sage Award to Nov. 5 http://tinyurl.com/24agj38",