From f82969d9d56a03dfd9438664abcf4f4b6b67d9a8 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Tue, 23 Jul 2024 22:59:06 +0200 Subject: [PATCH 01/12] add intial noisebench dataset class --- flair/datasets/sequence_labeling.py | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 38ca75e94b..044f2a8609 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4973,6 +4973,57 @@ def _write_instances(cls, version, base_path, split, data): out_file.write("\n") +class NER_NOISEBENCH(ColumnCorpus): + label_url = "https://github.com/elenamer/NoiseBench/tree/8a32da1e06f2239afc95b3f9dc5274abc25cc46d/data/annotations" + def __init__( + self, + noise: str = None, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + **corpusargs, + ) -> None: + """Initialize the NoiseBench corpus. + + Args: + noise (string): Chooses the labelset for the data. + clean (default): Clean labels + crowd,crowdbest,expert,distant,weak,llm : Different kinds of noisy labelsets (details: ...) + base_path (Optional[Union[str, Path]]): Path to the data. + Default is None, meaning the corpus gets automatically downloaded and saved. + You can override this by passing a path to a directory containing the unprocessed files but typically this + should not be necessary. + in_memory (bool): If True the dataset is kept in memory achieving speedups in training. + **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. + """ + if noise not in ['clean', None, 'crowd','crowdbest','expert','distant','weak','llm']: + raise Exception( + "Please choose a valid version" + ) + + base_path = self._set_path(base_path) + + filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' + + cached_path(f"{self.label_url}/{filename}.traindev", base_path) + cached_path(f"{self.label_url}/index.txt", base_path) + + super().__init__( + data_folder=base_path, + train_file=f"{filename}.train", + dev_file=f"{filename}.dev", + test_file=f"clean.test", # test set is always clean (without noise) + column_format={0: "text", 1: "ner"}, + in_memory=in_memory, + column_delimiter="\t", + document_separator = "-DOCSTART-", + **corpusargs, + ) + + @classmethod + def _set_path(cls, base_path) -> Path: + base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) + return base_path + class MASAKHA_POS(MultiCorpus): def __init__( self, From 2ede7190217ec28320fa537f5f2d615d1c12bcf4 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 09:34:56 +0200 Subject: [PATCH 02/12] add downloading of cleanconll --- flair/datasets/sequence_labeling.py | 32 ++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 044f2a8609..0c89793463 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5003,9 +5003,35 @@ def __init__( base_path = self._set_path(base_path) filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - - cached_path(f"{self.label_url}/{filename}.traindev", base_path) - cached_path(f"{self.label_url}/index.txt", base_path) + file_paths = [base_path / f'{filename}.train', base_path / f'{filename}.dev', base_path / 'clean.test'] + files_exist = [path.exists() for path in file_paths] + cleanconll_base_path = flair.cache_root / "datasets" / 
"cleanconll" + + if not all(files_exist): + cached_path(f"{self.label_url}/{filename}.traindev", base_path / 'annotations_only') + cached_path(f"{self.label_url}/index.txt", base_path / 'annotations_only') + + cleanconll_files_exist = [Path(f'{cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + if not all(cleanconll_files_exist): + # download cleanconll + + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {cleanconll_base_path}/CleanCoNLL" + os.system(clone) # Cloning + cwd = os.getcwd() + + os.chdir(f"{cleanconll_base_path}/CleanCoNLL") + chmod = f"chmod u+x create_cleanconll_from_conll03.sh" + os.system(chmod) + create = f"bash create_cleanconll_from_conll03.sh" + + os.system(create) + os.chdir(cwd) + + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', cleanconll_base_path) + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', cleanconll_base_path) + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', cleanconll_base_path) + + # create dataset files from index and train/test splits super().__init__( data_folder=base_path, From a34d2b2e5c7c60c8efc7d6ea0cd28c46cb901a47 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 09:35:11 +0200 Subject: [PATCH 03/12] update __init__ --- flair/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 2837e017c0..5a7ccf7e77 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -197,6 +197,7 @@ NER_ENGLISH_WIKIGOLD, NER_ENGLISH_WNUT_2020, NER_ESTONIAN_NOISY, + NER_NOISEBENCH, NER_FINNISH, NER_GERMAN_BIOFID, NER_GERMAN_EUROPARL, From 2684e93136fa7e61059aa9340cb23e5aa460fc0a Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 14:22:41 +0200 Subject: [PATCH 04/12] add processing of noisebench label sets --- flair/datasets/sequence_labeling.py | 144 ++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 18 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0c89793463..9a30c05ee4 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4974,7 +4974,9 @@ def _write_instances(cls, version, base_path, split, data): class NER_NOISEBENCH(ColumnCorpus): - label_url = "https://github.com/elenamer/NoiseBench/tree/8a32da1e06f2239afc95b3f9dc5274abc25cc46d/data/annotations" + label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/" + SAVE_TRAINDEV_FILE = False + def __init__( self, noise: str = None, @@ -5000,26 +5002,26 @@ def __init__( "Please choose a valid version" ) - base_path = self._set_path(base_path) + self._set_path(base_path) filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - file_paths = [base_path / f'{filename}.train', base_path / f'{filename}.dev', base_path / 'clean.test'] + file_paths = [self.base_path / f'{filename}.train', self.base_path / f'{filename}.dev', self.base_path / 'clean.test'] files_exist = [path.exists() for path in file_paths] - cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" + self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): - cached_path(f"{self.label_url}/{filename}.traindev", base_path / 'annotations_only') - cached_path(f"{self.label_url}/index.txt", base_path / 'annotations_only') + 
cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / 'annotations_only') + cached_path(f"{self.label_url}/index.txt", self.base_path / 'annotations_only') - cleanconll_files_exist = [Path(f'{cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + cleanconll_files_exist = [Path(f'{self.cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] if not all(cleanconll_files_exist): # download cleanconll - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {cleanconll_base_path}/CleanCoNLL" + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" os.system(clone) # Cloning cwd = os.getcwd() - os.chdir(f"{cleanconll_base_path}/CleanCoNLL") + os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") chmod = f"chmod u+x create_cleanconll_from_conll03.sh" os.system(chmod) create = f"bash create_cleanconll_from_conll03.sh" @@ -5027,29 +5029,135 @@ def __init__( os.system(create) os.chdir(cwd) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', cleanconll_base_path) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', cleanconll_base_path) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', self.cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', self.cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', self.cleanconll_base_path) + + shutil.rmtree(self.cleanconll_base_path / 'CleanCoNLL') # create dataset files from index and train/test splits + self.generate_data_files(filename,) super().__init__( - data_folder=base_path, + data_folder=self.base_path, train_file=f"{filename}.train", dev_file=f"{filename}.dev", test_file=f"clean.test", # test set is always clean (without noise) column_format={0: "text", 1: "ner"}, in_memory=in_memory, column_delimiter="\t", - document_separator = "-DOCSTART-", + document_separator_token = "-DOCSTART-", **corpusargs, ) - @classmethod - def _set_path(cls, base_path) -> Path: - base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - return base_path + def _set_path(self, base_path) -> Path: + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) + + @staticmethod + def read_column_file(filename): + raw = open(filename, 'r', errors='replace') + raw = raw.readlines() + all_x = [] + point = [] + for line in raw: + if '\t' in line.strip(): + stripped_line = line.strip().split('\t') + else: + stripped_line = line.strip().split(' ') + point.append(stripped_line) + if line.strip() == '': + if len(point[:-1]) > 0: + all_x.append(point[:-1]) + point = [] + + if len(point) > 0: + all_x.append(point) + + all_x = all_x + return all_x + + @staticmethod + def save_to_column_file(filename, list): + with open(filename, "w") as f: + for sentence in list: + for token in sentence: + f.write('\t'.join(token)) + f.write('\n') + f.write('\n') + + def _create_train_dev_splits(self, filename, all_sentences = None, datestring = '1996-08-24'): + if not all_sentences: + all_sentences = self.read_column_file(filename) + + train_sentences = [] + dev_sentences = [] + for i, s in enumerate(all_sentences): + if 'DOCSTART' in s[0][0]: + assert i+3 < 
len(all_sentences) # last document is too short + + # news date is usually in 3rd or 4th sentence of each article + if datestring in all_sentences[i+2][-1][0] or datestring in all_sentences[i+3][-1][0]: + save_to_dev = True + else: + save_to_dev = False + + if save_to_dev: + dev_sentences.append(s) + else: + train_sentences.append(s) + + self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.dev',dev_sentences) + self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.train',train_sentences) + + + def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): + # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file + + noisy_labels = self.read_column_file(os.path.join(self.base_path,'annotations_only',f'{corpus}.traindev')) + #print(noisy_labels) + #print(token_indices) + for index, sentence in zip(token_indices, noisy_labels): + + if index.strip() == 'docstart': + assert len(sentence) == 1 + sentence[0][0] = '-DOCSTART-' + continue + clean_sentence = all_clean_sentences[int(index.strip())] + + assert len(clean_sentence) == len(sentence) # this means indexing is wrong + + for token, label in zip(clean_sentence, sentence): + label[0] = token[0] # token[0] -> text, token[1] -> BIO label + if self.SAVE_TRAINDEV_FILE: + self.save_to_column_file(os.path.join(self.base_path,f'{corpus}.traindev'),noisy_labels) + return noisy_labels + + + def generate_data_files(self, filename): + + index_file = open(os.path.join(self.base_path,'annotations_only','index.txt')) + token_indices = index_file.readlines() + + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.train')) + + #os.makedirs(os.path.join('data','noisebench'), exist_ok=True) + + noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) + self._create_train_dev_splits(all_sentences=noisy_sentences,filename=os.path.join(self.base_path,f'{filename}.traindev')) + + # copy test set + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.test')) + + test_sentences = [] + for s in all_clean_test_sentences: + new_s = [] + for token in s: + new_s.append([token[0],token[4]]) + test_sentences.append(new_s) + + self.save_to_column_file(os.path.join(self.base_path,f'clean.test'),test_sentences) + class MASAKHA_POS(MultiCorpus): def __init__( self, From 0e8183d7a75276e6c9188d0d91f6211e961c54cb Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 14:57:35 +0200 Subject: [PATCH 05/12] fix formatting --- flair/datasets/__init__.py | 3 +- flair/datasets/sequence_labeling.py | 172 +++++++++++++++------------- 2 files changed, 97 insertions(+), 78 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5a7ccf7e77..b38d1bd761 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -197,7 +197,6 @@ NER_ENGLISH_WIKIGOLD, NER_ENGLISH_WNUT_2020, NER_ESTONIAN_NOISY, - NER_NOISEBENCH, NER_FINNISH, NER_GERMAN_BIOFID, NER_GERMAN_EUROPARL, @@ -217,6 +216,7 @@ NER_MULTI_WIKINER, NER_MULTI_XTREME, NER_NERMUD, + NER_NOISEBENCH, NER_SWEDISH, NER_TURKU, NER_UKRAINIAN, @@ -495,6 +495,7 @@ "NER_GERMAN_MOBIE", "NER_GERMAN_POLITICS", "NER_HIPE_2022", + "NER_NOISEBENCH", "NER_HUNGARIAN", "NER_ICDAR_EUROPEANA", "NER_ICELANDIC", diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 
9a30c05ee4..7cc06c8a0a 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4976,10 +4976,10 @@ def _write_instances(cls, version, base_path, split, data): class NER_NOISEBENCH(ColumnCorpus): label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/" SAVE_TRAINDEV_FILE = False - + def __init__( self, - noise: str = None, + noise: str = "clean", base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, **corpusargs, @@ -4997,78 +4997,90 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - if noise not in ['clean', None, 'crowd','crowdbest','expert','distant','weak','llm']: - raise Exception( - "Please choose a valid version" - ) + if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]: + raise Exception("Please choose a valid version") self._set_path(base_path) - filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - file_paths = [self.base_path / f'{filename}.train', self.base_path / f'{filename}.dev', self.base_path / 'clean.test'] + filename = "clean" if noise == "clean" else f"noise_{noise}" + file_paths = [ + self.base_path / f"{filename}.train", + self.base_path / f"{filename}.dev", + self.base_path / "clean.test", + ] files_exist = [path.exists() for path in file_paths] self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): - cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / 'annotations_only') - cached_path(f"{self.label_url}/index.txt", self.base_path / 'annotations_only') - - cleanconll_files_exist = [Path(f'{self.cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") + cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") + + cleanconll_files_exist = [ + Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"] + ] if not all(cleanconll_files_exist): # download cleanconll - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" - os.system(clone) # Cloning + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" + os.system(clone) # Cloning cwd = os.getcwd() os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") - chmod = f"chmod u+x create_cleanconll_from_conll03.sh" + chmod = "chmod u+x create_cleanconll_from_conll03.sh" os.system(chmod) - create = f"bash create_cleanconll_from_conll03.sh" - + create = "bash create_cleanconll_from_conll03.sh" + os.system(create) os.chdir(cwd) - - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', self.cleanconll_base_path) - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', self.cleanconll_base_path) - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', self.cleanconll_base_path) - - shutil.rmtree(self.cleanconll_base_path / 'CleanCoNLL') + + shutil.move( + f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train", + self.cleanconll_base_path, + ) + shutil.move( + f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path + ) + shutil.move( + 
f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path + ) + + shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL") # create dataset files from index and train/test splits - self.generate_data_files(filename,) + self.generate_data_files( + filename, + ) super().__init__( data_folder=self.base_path, train_file=f"{filename}.train", dev_file=f"{filename}.dev", - test_file=f"clean.test", # test set is always clean (without noise) + test_file="clean.test", # test set is always clean (without noise) column_format={0: "text", 1: "ner"}, in_memory=in_memory, column_delimiter="\t", - document_separator_token = "-DOCSTART-", + document_separator_token="-DOCSTART-", **corpusargs, ) - def _set_path(self, base_path) -> Path: + def _set_path(self, base_path): self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - - @staticmethod + + @staticmethod def read_column_file(filename): - raw = open(filename, 'r', errors='replace') - raw = raw.readlines() - all_x = [] - point = [] - for line in raw: - if '\t' in line.strip(): - stripped_line = line.strip().split('\t') - else: - stripped_line = line.strip().split(' ') - point.append(stripped_line) - if line.strip() == '': - if len(point[:-1]) > 0: - all_x.append(point[:-1]) - point = [] + with open(filename, errors="replace") as file: + lines = file.readlines() + all_x = [] + point = [] + for line in lines: + if "\t" in line.strip(): + stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") + + point.append(stripped_line) + if line.strip() == "": + if len(point[:-1]) > 0: + all_x.append(point[:-1]) + point = [] if len(point) > 0: all_x.append(point) @@ -5081,22 +5093,22 @@ def save_to_column_file(filename, list): with open(filename, "w") as f: for sentence in list: for token in sentence: - f.write('\t'.join(token)) - f.write('\n') - f.write('\n') + f.write("\t".join(token)) + f.write("\n") + f.write("\n") - def _create_train_dev_splits(self, filename, all_sentences = None, datestring = '1996-08-24'): + def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"): if not all_sentences: all_sentences = self.read_column_file(filename) - train_sentences = [] - dev_sentences = [] + train_sentences = [] + dev_sentences = [] for i, s in enumerate(all_sentences): - if 'DOCSTART' in s[0][0]: - assert i+3 < len(all_sentences) # last document is too short - + if "DOCSTART" in s[0][0]: + assert i + 3 < len(all_sentences) # last document is too short + # news date is usually in 3rd or 4th sentence of each article - if datestring in all_sentences[i+2][-1][0] or datestring in all_sentences[i+3][-1][0]: + if datestring in all_sentences[i + 2][-1][0] or datestring in all_sentences[i + 3][-1][0]: save_to_dev = True else: save_to_dev = False @@ -5106,57 +5118,63 @@ def _create_train_dev_splits(self, filename, all_sentences = None, datestring = else: train_sentences.append(s) - self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.dev',dev_sentences) - self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.train',train_sentences) - + self.save_to_column_file( + os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev", + dev_sentences, + ) + self.save_to_column_file( + os.sep.join(filename.split(os.sep)[:-1]) + os.sep + 
filename.split(os.sep)[-1].split(".")[0] + ".train", + train_sentences, + ) def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(os.path.join(self.base_path,'annotations_only',f'{corpus}.traindev')) - #print(noisy_labels) - #print(token_indices) + noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev")) + # print(noisy_labels) + # print(token_indices) for index, sentence in zip(token_indices, noisy_labels): - if index.strip() == 'docstart': + if index.strip() == "docstart": assert len(sentence) == 1 - sentence[0][0] = '-DOCSTART-' + sentence[0][0] = "-DOCSTART-" continue clean_sentence = all_clean_sentences[int(index.strip())] - assert len(clean_sentence) == len(sentence) # this means indexing is wrong + assert len(clean_sentence) == len(sentence) # this means indexing is wrong for token, label in zip(clean_sentence, sentence): - label[0] = token[0] # token[0] -> text, token[1] -> BIO label + label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(os.path.join(self.base_path,f'{corpus}.traindev'),noisy_labels) + self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) return noisy_labels - def generate_data_files(self, filename): - index_file = open(os.path.join(self.base_path,'annotations_only','index.txt')) - token_indices = index_file.readlines() + with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: + token_indices = index_file.readlines() - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.train')) + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train")) - #os.makedirs(os.path.join('data','noisebench'), exist_ok=True) + # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) - noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) - self._create_train_dev_splits(all_sentences=noisy_sentences,filename=os.path.join(self.base_path,f'{filename}.traindev')) - + noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) + self._create_train_dev_splits( + all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev") + ) + + # copy test set + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test")) - # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.test')) - test_sentences = [] for s in all_clean_test_sentences: new_s = [] for token in s: - new_s.append([token[0],token[4]]) + new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(os.path.join(self.base_path,f'clean.test'),test_sentences) + self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences) + class MASAKHA_POS(MultiCorpus): def __init__( From 7903d2d8fd0d2c985563dd50e571b0b008c60d75 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 12:57:00 +0100 Subject: [PATCH 06/12] change to use CLEANCONLL corpus --- flair/datasets/sequence_labeling.py | 41 +++++------------------------ 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 
455d3c4104..5abd0b56a3 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5263,46 +5263,19 @@ def __init__( self.base_path / "clean.test", ] files_exist = [path.exists() for path in file_paths] - self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") + + cleanconll_corpus = CLEANCONLL() - cleanconll_files_exist = [ - Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"] - ] - if not all(cleanconll_files_exist): - # download cleanconll - - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" - os.system(clone) # Cloning - cwd = os.getcwd() - - os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") - chmod = "chmod u+x create_cleanconll_from_conll03.sh" - os.system(chmod) - create = "bash create_cleanconll_from_conll03.sh" - - os.system(create) - os.chdir(cwd) - - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train", - self.cleanconll_base_path, - ) - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path - ) - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path - ) - - shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL") + self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits self.generate_data_files( filename, + cleanconll_corpus.__class__.__name__.lower() ) super().__init__( @@ -5403,12 +5376,12 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) return noisy_labels - def generate_data_files(self, filename): + def generate_data_files(self, filename, origin_dataset_name): with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: token_indices = index_file.readlines() - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train")) + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) @@ -5418,7 +5391,7 @@ def generate_data_files(self, filename): ) # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test")) + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.test")) test_sentences = [] for s in all_clean_test_sentences: From 1893d65c067857d375b99eeba0d2ce134f5c9b1b Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 15:02:11 +0100 Subject: [PATCH 07/12] raise ValueError and list supported noise types in the message --- flair/datasets/sequence_labeling.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 5abd0b56a3..f33c38a4a7 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5251,9 +5251,12 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups 
in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]: - raise Exception("Please choose a valid version") + VALUE_NOISE_VALUES = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"] + + if noise not in VALUE_NOISE_VALUES: + raise ValueError(f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!") + self._set_path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" From 24a28291d3f7a63fdf001aa927823d0f1a56a946 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 15:58:03 +0100 Subject: [PATCH 08/12] add file encoding --- flair/datasets/sequence_labeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f33c38a4a7..862ca2bf44 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5298,7 +5298,7 @@ def _set_path(self, base_path): @staticmethod def read_column_file(filename): - with open(filename, errors="replace") as file: + with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() all_x = [] point = [] @@ -5320,7 +5320,7 @@ def read_column_file(filename): @staticmethod def save_to_column_file(filename, list): - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: for sentence in list: for token in sentence: f.write("\t".join(token)) @@ -5381,7 +5381,7 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): def generate_data_files(self, filename, origin_dataset_name): - with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: + with open(os.path.join(self.base_path, "annotations_only", "index.txt"), "r", encoding="utf-8") as index_file: token_indices = index_file.readlines() all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) From ae10a61fc4a7068f9c34a5ca2be347134eec4d17 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:01:01 +0100 Subject: [PATCH 09/12] simplify paths --- flair/datasets/sequence_labeling.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 862ca2bf44..1be58b9a37 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5257,7 +5257,7 @@ def __init__( if noise not in VALUE_NOISE_VALUES: raise ValueError(f"Unsupported value for noise type argument. 
Got {noise}, expected one of {VALUE_NOISE_VALUES}!") - self._set_path(base_path) + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" file_paths = [ @@ -5293,9 +5293,6 @@ def __init__( **corpusargs, ) - def _set_path(self, base_path): - self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - @staticmethod def read_column_file(filename): with open(filename, "r", errors="replace", encoding="utf-8") as file: @@ -5349,20 +5346,18 @@ def _create_train_dev_splits(self, filename, all_sentences=None, datestring="199 train_sentences.append(s) self.save_to_column_file( - os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev", + filename.parent / f"{filename.stem}.dev", dev_sentences, ) self.save_to_column_file( - os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".train", + filename.parent / f"{filename.stem}.train", train_sentences, ) def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev")) - # print(noisy_labels) - # print(token_indices) + noisy_labels = self.read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") for index, sentence in zip(token_indices, noisy_labels): if index.strip() == "docstart": @@ -5376,25 +5371,24 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): for token, label in zip(clean_sentence, sentence): label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) + self.save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels def generate_data_files(self, filename, origin_dataset_name): - with open(os.path.join(self.base_path, "annotations_only", "index.txt"), "r", encoding="utf-8") as index_file: + with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: token_indices = index_file.readlines() - - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) + all_clean_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) self._create_train_dev_splits( - all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev") + all_sentences=noisy_sentences, filename=self.base_path / f"{filename}.traindev" ) # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.test")) + all_clean_test_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] for s in all_clean_test_sentences: @@ -5403,7 +5397,7 @@ def generate_data_files(self, filename, origin_dataset_name): new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences) + self.save_to_column_file(self.base_path / "clean.test", test_sentences) 
class MASAKHA_POS(MultiCorpus): From be83c9eafc997580e2dc04170562d4df5ec415af Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:18:56 +0100 Subject: [PATCH 10/12] make functions private and add type annotations --- flair/datasets/sequence_labeling.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 1be58b9a37..12da50bd49 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5276,7 +5276,7 @@ def __init__( self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits - self.generate_data_files( + self._generate_data_files( filename, cleanconll_corpus.__class__.__name__.lower() ) @@ -5294,7 +5294,7 @@ def __init__( ) @staticmethod - def read_column_file(filename): + def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() all_x = [] @@ -5316,7 +5316,7 @@ def read_column_file(filename): return all_x @staticmethod - def save_to_column_file(filename, list): + def _save_to_column_file(filename: Union[str, Path], list: list[list[str]]) -> None: with open(filename, "w", encoding="utf-8") as f: for sentence in list: for token in sentence: @@ -5324,9 +5324,9 @@ def save_to_column_file(filename, list): f.write("\n") f.write("\n") - def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"): + def _create_train_dev_splits(self, filename: Path, all_sentences: list = None, datestring: str ="1996-08-24") -> None: if not all_sentences: - all_sentences = self.read_column_file(filename) + all_sentences = self._read_column_file(filename) train_sentences = [] dev_sentences = [] @@ -5345,19 +5345,19 @@ def _create_train_dev_splits(self, filename, all_sentences=None, datestring="199 else: train_sentences.append(s) - self.save_to_column_file( + self._save_to_column_file( filename.parent / f"{filename.stem}.dev", dev_sentences, ) - self.save_to_column_file( + self._save_to_column_file( filename.parent / f"{filename.stem}.train", train_sentences, ) - def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): + def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_indices: list) -> list[list[str]]: # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") + noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") for index, sentence in zip(token_indices, noisy_labels): if index.strip() == "docstart": @@ -5371,14 +5371,14 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): for token, label in zip(clean_sentence, sentence): label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) + self._save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels - def generate_data_files(self, filename, origin_dataset_name): + def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: str) -> None: with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: token_indices = 
index_file.readlines() - all_clean_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") + all_clean_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) @@ -5388,7 +5388,7 @@ def generate_data_files(self, filename, origin_dataset_name): ) # copy test set - all_clean_test_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") + all_clean_test_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] for s in all_clean_test_sentences: @@ -5397,7 +5397,7 @@ def generate_data_files(self, filename, origin_dataset_name): new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(self.base_path / "clean.test", test_sentences) + self._save_to_column_file(self.base_path / "clean.test", test_sentences) class MASAKHA_POS(MultiCorpus): From c7f4a6cd0448386252a55170719ab39fbaf46443 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:20:42 +0100 Subject: [PATCH 11/12] rename some variables and small refactor --- flair/datasets/sequence_labeling.py | 37 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 12da50bd49..76c842a68e 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5297,28 +5297,30 @@ def __init__( def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() - all_x = [] - point = [] + all_sentences = [] + sentence = [] for line in lines: if "\t" in line.strip(): - stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") + stripped_line = line.strip().split("\t") + else: + stripped_line = line.strip().split(" ") - point.append(stripped_line) + sentence.append(stripped_line) if line.strip() == "": - if len(point[:-1]) > 0: - all_x.append(point[:-1]) - point = [] + if len(sentence[:-1]) > 0: + all_sentences.append(sentence[:-1]) + sentence = [] - if len(point) > 0: - all_x.append(point) + if len(sentence) > 0: + all_sentences.append(sentence) - all_x = all_x - return all_x + all_sentences = all_sentences + return all_sentences @staticmethod - def _save_to_column_file(filename: Union[str, Path], list: list[list[str]]) -> None: + def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) -> None: with open(filename, "w", encoding="utf-8") as f: - for sentence in list: + for sentence in sentences: for token in sentence: f.write("\t".join(token)) f.write("\n") @@ -5391,11 +5393,10 @@ def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: all_clean_test_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] - for s in all_clean_test_sentences: - new_s = [] - for token in s: - new_s.append([token[0], token[4]]) - test_sentences.append(new_s) + + for sentence in all_clean_test_sentences: + new_sentence = [[tokens[0], tokens[4]] for tokens in sentence] + test_sentences.append(new_sentence) self._save_to_column_file(self.base_path / "clean.test", test_sentences) From 4e159925f961d2475f0694cefaf1840c52f383fd Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:35:16 +0100 Subject: [PATCH 12/12] formatting 
and fix some typing --- flair/datasets/sequence_labeling.py | 41 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 76c842a68e..b2ab2f45dd 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1,9 +1,9 @@ import copy +import gzip import json import logging import os import re -import gzip import shutil import tarfile import tempfile @@ -5251,12 +5251,13 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - VALUE_NOISE_VALUES = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"] - + if noise not in VALUE_NOISE_VALUES: - raise ValueError(f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!") - + raise ValueError( + f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!" + ) + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" @@ -5270,16 +5271,13 @@ def __init__( if not all(files_exist): cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") - + cleanconll_corpus = CLEANCONLL() self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits - self._generate_data_files( - filename, - cleanconll_corpus.__class__.__name__.lower() - ) + self._generate_data_files(filename, cleanconll_corpus.__class__.__name__.lower()) super().__init__( data_folder=self.base_path, @@ -5294,16 +5292,13 @@ def __init__( ) @staticmethod - def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: - with open(filename, "r", errors="replace", encoding="utf-8") as file: + def _read_column_file(filename: Union[str, Path]) -> list[list[list[str]]]: + with open(filename, errors="replace", encoding="utf-8") as file: lines = file.readlines() all_sentences = [] sentence = [] for line in lines: - if "\t" in line.strip(): - stripped_line = line.strip().split("\t") - else: - stripped_line = line.strip().split(" ") + stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") sentence.append(stripped_line) if line.strip() == "": @@ -5318,7 +5313,7 @@ def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: return all_sentences @staticmethod - def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) -> None: + def _save_to_column_file(filename: Union[str, Path], sentences: list[list[list[str]]]) -> None: with open(filename, "w", encoding="utf-8") as f: for sentence in sentences: for token in sentence: @@ -5326,7 +5321,9 @@ def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) f.write("\n") f.write("\n") - def _create_train_dev_splits(self, filename: Path, all_sentences: list = None, datestring: str ="1996-08-24") -> None: + def _create_train_dev_splits( + self, filename: Path, all_sentences: Optional[list] = None, datestring: str = "1996-08-24" + ) -> None: if not all_sentences: all_sentences = self._read_column_file(filename) @@ -5356,7 +5353,9 @@ def _create_train_dev_splits(self, filename: Path, 
all_sentences: list = None, d train_sentences, ) - def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_indices: list) -> list[list[str]]: + def _merge_tokens_labels( + self, corpus: str, all_clean_sentences: list, token_indices: list + ) -> list[list[list[str]]]: # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") @@ -5376,9 +5375,9 @@ def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_ind self._save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels - def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: str) -> None: + def _generate_data_files(self, filename: str, origin_dataset_name: str) -> None: - with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: + with open(self.base_path / "annotations_only" / "index.txt", encoding="utf-8") as index_file: token_indices = index_file.readlines() all_clean_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train")
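
For reference, a minimal usage sketch of the NER_NOISEBENCH class introduced by this series (editorial addition, not part of any patch; it assumes the final merged state, in which the class is exported from flair.datasets and accepts the noise values validated in PATCH 07):

    from flair.datasets import NER_NOISEBENCH

    # Clean reference labels: on first use this downloads the NoiseBench
    # annotation files and the CleanCoNLL corpus, then builds the
    # tab-separated train/dev/test column files in the flair cache.
    clean_corpus = NER_NOISEBENCH(noise="clean")

    # One of the noisy label sets ("crowd", "crowdbest", "expert",
    # "distant", "weak" or "llm"); the test split is always the clean one.
    noisy_corpus = NER_NOISEBENCH(noise="expert")

    print(clean_corpus)  # reports train/dev/test sizes
    label_dict = noisy_corpus.make_label_dictionary(label_type="ner")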