From f82969d9d56a03dfd9438664abcf4f4b6b67d9a8 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Tue, 23 Jul 2024 22:59:06 +0200 Subject: [PATCH 01/12] add intial noisebench dataset class --- flair/datasets/sequence_labeling.py | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 38ca75e94b..044f2a8609 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4973,6 +4973,57 @@ def _write_instances(cls, version, base_path, split, data): out_file.write("\n") +class NER_NOISEBENCH(ColumnCorpus): + label_url = "https://github.com/elenamer/NoiseBench/tree/8a32da1e06f2239afc95b3f9dc5274abc25cc46d/data/annotations" + def __init__( + self, + noise: str = None, + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + **corpusargs, + ) -> None: + """Initialize the NoiseBench corpus. + + Args: + noise (string): Chooses the labelset for the data. + clean (default): Clean labels + crowd,crowdbest,expert,distant,weak,llm : Different kinds of noisy labelsets (details: ...) + base_path (Optional[Union[str, Path]]): Path to the data. + Default is None, meaning the corpus gets automatically downloaded and saved. + You can override this by passing a path to a directory containing the unprocessed files but typically this + should not be necessary. + in_memory (bool): If True the dataset is kept in memory achieving speedups in training. + **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. + """ + if noise not in ['clean', None, 'crowd','crowdbest','expert','distant','weak','llm']: + raise Exception( + "Please choose a valid version" + ) + + base_path = self._set_path(base_path) + + filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' + + cached_path(f"{self.label_url}/{filename}.traindev", base_path) + cached_path(f"{self.label_url}/index.txt", base_path) + + super().__init__( + data_folder=base_path, + train_file=f"{filename}.train", + dev_file=f"{filename}.dev", + test_file=f"clean.test", # test set is always clean (without noise) + column_format={0: "text", 1: "ner"}, + in_memory=in_memory, + column_delimiter="\t", + document_separator = "-DOCSTART-", + **corpusargs, + ) + + @classmethod + def _set_path(cls, base_path) -> Path: + base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) + return base_path + class MASAKHA_POS(MultiCorpus): def __init__( self, From 2ede7190217ec28320fa537f5f2d615d1c12bcf4 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 09:34:56 +0200 Subject: [PATCH 02/12] add downloading of cleanconll --- flair/datasets/sequence_labeling.py | 32 ++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 044f2a8609..0c89793463 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5003,9 +5003,35 @@ def __init__( base_path = self._set_path(base_path) filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - - cached_path(f"{self.label_url}/{filename}.traindev", base_path) - cached_path(f"{self.label_url}/index.txt", base_path) + file_paths = [base_path / f'{filename}.train', base_path / f'{filename}.dev', base_path / 'clean.test'] + files_exist = [path.exists() for path in file_paths] + cleanconll_base_path = flair.cache_root / "datasets" / 
"cleanconll" + + if not all(files_exist): + cached_path(f"{self.label_url}/{filename}.traindev", base_path / 'annotations_only') + cached_path(f"{self.label_url}/index.txt", base_path / 'annotations_only') + + cleanconll_files_exist = [Path(f'{cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + if not all(cleanconll_files_exist): + # download cleanconll + + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {cleanconll_base_path}/CleanCoNLL" + os.system(clone) # Cloning + cwd = os.getcwd() + + os.chdir(f"{cleanconll_base_path}/CleanCoNLL") + chmod = f"chmod u+x create_cleanconll_from_conll03.sh" + os.system(chmod) + create = f"bash create_cleanconll_from_conll03.sh" + + os.system(create) + os.chdir(cwd) + + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', cleanconll_base_path) + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', cleanconll_base_path) + shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', cleanconll_base_path) + + # create dataset files from index and train/test splits super().__init__( data_folder=base_path, From a34d2b2e5c7c60c8efc7d6ea0cd28c46cb901a47 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 09:35:11 +0200 Subject: [PATCH 03/12] update __init__ --- flair/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 2837e017c0..5a7ccf7e77 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -197,6 +197,7 @@ NER_ENGLISH_WIKIGOLD, NER_ENGLISH_WNUT_2020, NER_ESTONIAN_NOISY, + NER_NOISEBENCH, NER_FINNISH, NER_GERMAN_BIOFID, NER_GERMAN_EUROPARL, From 2684e93136fa7e61059aa9340cb23e5aa460fc0a Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 14:22:41 +0200 Subject: [PATCH 04/12] add processing of noisebench label sets --- flair/datasets/sequence_labeling.py | 144 ++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 18 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0c89793463..9a30c05ee4 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4974,7 +4974,9 @@ def _write_instances(cls, version, base_path, split, data): class NER_NOISEBENCH(ColumnCorpus): - label_url = "https://github.com/elenamer/NoiseBench/tree/8a32da1e06f2239afc95b3f9dc5274abc25cc46d/data/annotations" + label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/" + SAVE_TRAINDEV_FILE = False + def __init__( self, noise: str = None, @@ -5000,26 +5002,26 @@ def __init__( "Please choose a valid version" ) - base_path = self._set_path(base_path) + self._set_path(base_path) filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - file_paths = [base_path / f'{filename}.train', base_path / f'{filename}.dev', base_path / 'clean.test'] + file_paths = [self.base_path / f'{filename}.train', self.base_path / f'{filename}.dev', self.base_path / 'clean.test'] files_exist = [path.exists() for path in file_paths] - cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" + self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): - cached_path(f"{self.label_url}/{filename}.traindev", base_path / 'annotations_only') - cached_path(f"{self.label_url}/index.txt", base_path / 'annotations_only') + 
cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / 'annotations_only') + cached_path(f"{self.label_url}/index.txt", self.base_path / 'annotations_only') - cleanconll_files_exist = [Path(f'{cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + cleanconll_files_exist = [Path(f'{self.cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] if not all(cleanconll_files_exist): # download cleanconll - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {cleanconll_base_path}/CleanCoNLL" + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" os.system(clone) # Cloning cwd = os.getcwd() - os.chdir(f"{cleanconll_base_path}/CleanCoNLL") + os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") chmod = f"chmod u+x create_cleanconll_from_conll03.sh" os.system(chmod) create = f"bash create_cleanconll_from_conll03.sh" @@ -5027,29 +5029,135 @@ def __init__( os.system(create) os.chdir(cwd) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', cleanconll_base_path) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', cleanconll_base_path) - shutil.move(f'{cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', self.cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', self.cleanconll_base_path) + shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', self.cleanconll_base_path) + + shutil.rmtree(self.cleanconll_base_path / 'CleanCoNLL') # create dataset files from index and train/test splits + self.generate_data_files(filename,) super().__init__( - data_folder=base_path, + data_folder=self.base_path, train_file=f"{filename}.train", dev_file=f"{filename}.dev", test_file=f"clean.test", # test set is always clean (without noise) column_format={0: "text", 1: "ner"}, in_memory=in_memory, column_delimiter="\t", - document_separator = "-DOCSTART-", + document_separator_token = "-DOCSTART-", **corpusargs, ) - @classmethod - def _set_path(cls, base_path) -> Path: - base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - return base_path + def _set_path(self, base_path) -> Path: + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) + + @staticmethod + def read_column_file(filename): + raw = open(filename, 'r', errors='replace') + raw = raw.readlines() + all_x = [] + point = [] + for line in raw: + if '\t' in line.strip(): + stripped_line = line.strip().split('\t') + else: + stripped_line = line.strip().split(' ') + point.append(stripped_line) + if line.strip() == '': + if len(point[:-1]) > 0: + all_x.append(point[:-1]) + point = [] + + if len(point) > 0: + all_x.append(point) + + all_x = all_x + return all_x + + @staticmethod + def save_to_column_file(filename, list): + with open(filename, "w") as f: + for sentence in list: + for token in sentence: + f.write('\t'.join(token)) + f.write('\n') + f.write('\n') + + def _create_train_dev_splits(self, filename, all_sentences = None, datestring = '1996-08-24'): + if not all_sentences: + all_sentences = self.read_column_file(filename) + + train_sentences = [] + dev_sentences = [] + for i, s in enumerate(all_sentences): + if 'DOCSTART' in s[0][0]: + assert i+3 < 
len(all_sentences) # last document is too short + + # news date is usually in 3rd or 4th sentence of each article + if datestring in all_sentences[i+2][-1][0] or datestring in all_sentences[i+3][-1][0]: + save_to_dev = True + else: + save_to_dev = False + + if save_to_dev: + dev_sentences.append(s) + else: + train_sentences.append(s) + + self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.dev',dev_sentences) + self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.train',train_sentences) + + + def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): + # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file + + noisy_labels = self.read_column_file(os.path.join(self.base_path,'annotations_only',f'{corpus}.traindev')) + #print(noisy_labels) + #print(token_indices) + for index, sentence in zip(token_indices, noisy_labels): + + if index.strip() == 'docstart': + assert len(sentence) == 1 + sentence[0][0] = '-DOCSTART-' + continue + clean_sentence = all_clean_sentences[int(index.strip())] + + assert len(clean_sentence) == len(sentence) # this means indexing is wrong + + for token, label in zip(clean_sentence, sentence): + label[0] = token[0] # token[0] -> text, token[1] -> BIO label + if self.SAVE_TRAINDEV_FILE: + self.save_to_column_file(os.path.join(self.base_path,f'{corpus}.traindev'),noisy_labels) + return noisy_labels + + + def generate_data_files(self, filename): + + index_file = open(os.path.join(self.base_path,'annotations_only','index.txt')) + token_indices = index_file.readlines() + + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.train')) + + #os.makedirs(os.path.join('data','noisebench'), exist_ok=True) + + noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) + self._create_train_dev_splits(all_sentences=noisy_sentences,filename=os.path.join(self.base_path,f'{filename}.traindev')) + + # copy test set + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.test')) + + test_sentences = [] + for s in all_clean_test_sentences: + new_s = [] + for token in s: + new_s.append([token[0],token[4]]) + test_sentences.append(new_s) + + self.save_to_column_file(os.path.join(self.base_path,f'clean.test'),test_sentences) + class MASAKHA_POS(MultiCorpus): def __init__( self, From 0e8183d7a75276e6c9188d0d91f6211e961c54cb Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 24 Jul 2024 14:57:35 +0200 Subject: [PATCH 05/12] fix formatting --- flair/datasets/__init__.py | 3 +- flair/datasets/sequence_labeling.py | 172 +++++++++++++++------------- 2 files changed, 97 insertions(+), 78 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5a7ccf7e77..b38d1bd761 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -197,7 +197,6 @@ NER_ENGLISH_WIKIGOLD, NER_ENGLISH_WNUT_2020, NER_ESTONIAN_NOISY, - NER_NOISEBENCH, NER_FINNISH, NER_GERMAN_BIOFID, NER_GERMAN_EUROPARL, @@ -217,6 +216,7 @@ NER_MULTI_WIKINER, NER_MULTI_XTREME, NER_NERMUD, + NER_NOISEBENCH, NER_SWEDISH, NER_TURKU, NER_UKRAINIAN, @@ -495,6 +495,7 @@ "NER_GERMAN_MOBIE", "NER_GERMAN_POLITICS", "NER_HIPE_2022", + "NER_NOISEBENCH", "NER_HUNGARIAN", "NER_ICDAR_EUROPEANA", "NER_ICELANDIC", diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 
9a30c05ee4..7cc06c8a0a 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4976,10 +4976,10 @@ def _write_instances(cls, version, base_path, split, data): class NER_NOISEBENCH(ColumnCorpus): label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/" SAVE_TRAINDEV_FILE = False - + def __init__( self, - noise: str = None, + noise: str = "clean", base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, **corpusargs, @@ -4997,78 +4997,90 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - if noise not in ['clean', None, 'crowd','crowdbest','expert','distant','weak','llm']: - raise Exception( - "Please choose a valid version" - ) + if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]: + raise Exception("Please choose a valid version") self._set_path(base_path) - filename = 'clean' if noise in ['clean',None] else f'noise_{noise}' - file_paths = [self.base_path / f'{filename}.train', self.base_path / f'{filename}.dev', self.base_path / 'clean.test'] + filename = "clean" if noise == "clean" else f"noise_{noise}" + file_paths = [ + self.base_path / f"{filename}.train", + self.base_path / f"{filename}.dev", + self.base_path / "clean.test", + ] files_exist = [path.exists() for path in file_paths] self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): - cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / 'annotations_only') - cached_path(f"{self.label_url}/index.txt", self.base_path / 'annotations_only') - - cleanconll_files_exist = [Path(f'{self.cleanconll_base_path}/cleanconll.{split}').exists() for split in ['train','dev','test']] + cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") + cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") + + cleanconll_files_exist = [ + Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"] + ] if not all(cleanconll_files_exist): # download cleanconll - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" - os.system(clone) # Cloning + clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" + os.system(clone) # Cloning cwd = os.getcwd() os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") - chmod = f"chmod u+x create_cleanconll_from_conll03.sh" + chmod = "chmod u+x create_cleanconll_from_conll03.sh" os.system(chmod) - create = f"bash create_cleanconll_from_conll03.sh" - + create = "bash create_cleanconll_from_conll03.sh" + os.system(create) os.chdir(cwd) - - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train', self.cleanconll_base_path) - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev', self.cleanconll_base_path) - shutil.move(f'{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test', self.cleanconll_base_path) - - shutil.rmtree(self.cleanconll_base_path / 'CleanCoNLL') + + shutil.move( + f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train", + self.cleanconll_base_path, + ) + shutil.move( + f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path + ) + shutil.move( + 
f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path + ) + + shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL") # create dataset files from index and train/test splits - self.generate_data_files(filename,) + self.generate_data_files( + filename, + ) super().__init__( data_folder=self.base_path, train_file=f"{filename}.train", dev_file=f"{filename}.dev", - test_file=f"clean.test", # test set is always clean (without noise) + test_file="clean.test", # test set is always clean (without noise) column_format={0: "text", 1: "ner"}, in_memory=in_memory, column_delimiter="\t", - document_separator_token = "-DOCSTART-", + document_separator_token="-DOCSTART-", **corpusargs, ) - def _set_path(self, base_path) -> Path: + def _set_path(self, base_path): self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - - @staticmethod + + @staticmethod def read_column_file(filename): - raw = open(filename, 'r', errors='replace') - raw = raw.readlines() - all_x = [] - point = [] - for line in raw: - if '\t' in line.strip(): - stripped_line = line.strip().split('\t') - else: - stripped_line = line.strip().split(' ') - point.append(stripped_line) - if line.strip() == '': - if len(point[:-1]) > 0: - all_x.append(point[:-1]) - point = [] + with open(filename, errors="replace") as file: + lines = file.readlines() + all_x = [] + point = [] + for line in lines: + if "\t" in line.strip(): + stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") + + point.append(stripped_line) + if line.strip() == "": + if len(point[:-1]) > 0: + all_x.append(point[:-1]) + point = [] if len(point) > 0: all_x.append(point) @@ -5081,22 +5093,22 @@ def save_to_column_file(filename, list): with open(filename, "w") as f: for sentence in list: for token in sentence: - f.write('\t'.join(token)) - f.write('\n') - f.write('\n') + f.write("\t".join(token)) + f.write("\n") + f.write("\n") - def _create_train_dev_splits(self, filename, all_sentences = None, datestring = '1996-08-24'): + def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"): if not all_sentences: all_sentences = self.read_column_file(filename) - train_sentences = [] - dev_sentences = [] + train_sentences = [] + dev_sentences = [] for i, s in enumerate(all_sentences): - if 'DOCSTART' in s[0][0]: - assert i+3 < len(all_sentences) # last document is too short - + if "DOCSTART" in s[0][0]: + assert i + 3 < len(all_sentences) # last document is too short + # news date is usually in 3rd or 4th sentence of each article - if datestring in all_sentences[i+2][-1][0] or datestring in all_sentences[i+3][-1][0]: + if datestring in all_sentences[i + 2][-1][0] or datestring in all_sentences[i + 3][-1][0]: save_to_dev = True else: save_to_dev = False @@ -5106,57 +5118,63 @@ def _create_train_dev_splits(self, filename, all_sentences = None, datestring = else: train_sentences.append(s) - self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.dev',dev_sentences) - self.save_to_column_file(os.sep.join(filename.split(os.sep)[:-1])+os.sep+filename.split(os.sep)[-1].split('.')[0]+'.train',train_sentences) - + self.save_to_column_file( + os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev", + dev_sentences, + ) + self.save_to_column_file( + os.sep.join(filename.split(os.sep)[:-1]) + os.sep + 
filename.split(os.sep)[-1].split(".")[0] + ".train", + train_sentences, + ) def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(os.path.join(self.base_path,'annotations_only',f'{corpus}.traindev')) - #print(noisy_labels) - #print(token_indices) + noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev")) + # print(noisy_labels) + # print(token_indices) for index, sentence in zip(token_indices, noisy_labels): - if index.strip() == 'docstart': + if index.strip() == "docstart": assert len(sentence) == 1 - sentence[0][0] = '-DOCSTART-' + sentence[0][0] = "-DOCSTART-" continue clean_sentence = all_clean_sentences[int(index.strip())] - assert len(clean_sentence) == len(sentence) # this means indexing is wrong + assert len(clean_sentence) == len(sentence) # this means indexing is wrong for token, label in zip(clean_sentence, sentence): - label[0] = token[0] # token[0] -> text, token[1] -> BIO label + label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(os.path.join(self.base_path,f'{corpus}.traindev'),noisy_labels) + self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) return noisy_labels - def generate_data_files(self, filename): - index_file = open(os.path.join(self.base_path,'annotations_only','index.txt')) - token_indices = index_file.readlines() + with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: + token_indices = index_file.readlines() - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.train')) + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train")) - #os.makedirs(os.path.join('data','noisebench'), exist_ok=True) + # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) - noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) - self._create_train_dev_splits(all_sentences=noisy_sentences,filename=os.path.join(self.base_path,f'{filename}.traindev')) - + noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) + self._create_train_dev_splits( + all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev") + ) + + # copy test set + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test")) - # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path,'cleanconll.test')) - test_sentences = [] for s in all_clean_test_sentences: new_s = [] for token in s: - new_s.append([token[0],token[4]]) + new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(os.path.join(self.base_path,f'clean.test'),test_sentences) + self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences) + class MASAKHA_POS(MultiCorpus): def __init__( From 7903d2d8fd0d2c985563dd50e571b0b008c60d75 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 12:57:00 +0100 Subject: [PATCH 06/12] change to use CLEANCONLL corpus --- flair/datasets/sequence_labeling.py | 41 +++++------------------------ 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 
455d3c4104..5abd0b56a3 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5263,46 +5263,19 @@ def __init__( self.base_path / "clean.test", ] files_exist = [path.exists() for path in file_paths] - self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll" if not all(files_exist): cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") + + cleanconll_corpus = CLEANCONLL() - cleanconll_files_exist = [ - Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"] - ] - if not all(cleanconll_files_exist): - # download cleanconll - - clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL" - os.system(clone) # Cloning - cwd = os.getcwd() - - os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL") - chmod = "chmod u+x create_cleanconll_from_conll03.sh" - os.system(chmod) - create = "bash create_cleanconll_from_conll03.sh" - - os.system(create) - os.chdir(cwd) - - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train", - self.cleanconll_base_path, - ) - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path - ) - shutil.move( - f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path - ) - - shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL") + self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits self.generate_data_files( filename, + cleanconll_corpus.__class__.__name__.lower() ) super().__init__( @@ -5403,12 +5376,12 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) return noisy_labels - def generate_data_files(self, filename): + def generate_data_files(self, filename, origin_dataset_name): with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: token_indices = index_file.readlines() - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train")) + all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) @@ -5418,7 +5391,7 @@ def generate_data_files(self, filename): ) # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test")) + all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.test")) test_sentences = [] for s in all_clean_test_sentences: From 1893d65c067857d375b99eeba0d2ce134f5c9b1b Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 15:02:11 +0100 Subject: [PATCH 07/12] raise ValueError and list supported noise types in the message --- flair/datasets/sequence_labeling.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 5abd0b56a3..f33c38a4a7 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5251,9 +5251,12 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups 
in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]: - raise Exception("Please choose a valid version") + VALUE_NOISE_VALUES = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"] + + if noise not in VALUE_NOISE_VALUES: + raise ValueError(f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!") + self._set_path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" From 24a28291d3f7a63fdf001aa927823d0f1a56a946 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 15:58:03 +0100 Subject: [PATCH 08/12] add file encoding --- flair/datasets/sequence_labeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index f33c38a4a7..862ca2bf44 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5298,7 +5298,7 @@ def _set_path(self, base_path): @staticmethod def read_column_file(filename): - with open(filename, errors="replace") as file: + with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() all_x = [] point = [] @@ -5320,7 +5320,7 @@ def read_column_file(filename): @staticmethod def save_to_column_file(filename, list): - with open(filename, "w") as f: + with open(filename, "w", encoding="utf-8") as f: for sentence in list: for token in sentence: f.write("\t".join(token)) @@ -5381,7 +5381,7 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): def generate_data_files(self, filename, origin_dataset_name): - with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file: + with open(os.path.join(self.base_path, "annotations_only", "index.txt"), "r", encoding="utf-8") as index_file: token_indices = index_file.readlines() all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) From ae10a61fc4a7068f9c34a5ca2be347134eec4d17 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:01:01 +0100 Subject: [PATCH 09/12] simplify paths --- flair/datasets/sequence_labeling.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 862ca2bf44..1be58b9a37 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5257,7 +5257,7 @@ def __init__( if noise not in VALUE_NOISE_VALUES: raise ValueError(f"Unsupported value for noise type argument. 
Got {noise}, expected one of {VALUE_NOISE_VALUES}!") - self._set_path(base_path) + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" file_paths = [ @@ -5293,9 +5293,6 @@ def __init__( **corpusargs, ) - def _set_path(self, base_path): - self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) - @staticmethod def read_column_file(filename): with open(filename, "r", errors="replace", encoding="utf-8") as file: @@ -5349,20 +5346,18 @@ def _create_train_dev_splits(self, filename, all_sentences=None, datestring="199 train_sentences.append(s) self.save_to_column_file( - os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev", + filename.parent / f"{filename.stem}.dev", dev_sentences, ) self.save_to_column_file( - os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".train", + filename.parent / f"{filename.stem}.train", train_sentences, ) def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev")) - # print(noisy_labels) - # print(token_indices) + noisy_labels = self.read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") for index, sentence in zip(token_indices, noisy_labels): if index.strip() == "docstart": @@ -5376,25 +5371,24 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): for token, label in zip(clean_sentence, sentence): label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels) + self.save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels def generate_data_files(self, filename, origin_dataset_name): - with open(os.path.join(self.base_path, "annotations_only", "index.txt"), "r", encoding="utf-8") as index_file: + with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: token_indices = index_file.readlines() - - all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.train")) + all_clean_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices) self._create_train_dev_splits( - all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev") + all_sentences=noisy_sentences, filename=self.base_path / f"{filename}.traindev" ) # copy test set - all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, f"{origin_dataset_name}.test")) + all_clean_test_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] for s in all_clean_test_sentences: @@ -5403,7 +5397,7 @@ def generate_data_files(self, filename, origin_dataset_name): new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences) + self.save_to_column_file(self.base_path / "clean.test", test_sentences) 
class MASAKHA_POS(MultiCorpus): From be83c9eafc997580e2dc04170562d4df5ec415af Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:18:56 +0100 Subject: [PATCH 10/12] make functions private and add type annotations --- flair/datasets/sequence_labeling.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 1be58b9a37..12da50bd49 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5276,7 +5276,7 @@ def __init__( self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits - self.generate_data_files( + self._generate_data_files( filename, cleanconll_corpus.__class__.__name__.lower() ) @@ -5294,7 +5294,7 @@ def __init__( ) @staticmethod - def read_column_file(filename): + def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() all_x = [] @@ -5316,7 +5316,7 @@ def read_column_file(filename): return all_x @staticmethod - def save_to_column_file(filename, list): + def _save_to_column_file(filename: Union[str, Path], list: list[list[str]]) -> None: with open(filename, "w", encoding="utf-8") as f: for sentence in list: for token in sentence: @@ -5324,9 +5324,9 @@ def save_to_column_file(filename, list): f.write("\n") f.write("\n") - def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"): + def _create_train_dev_splits(self, filename: Path, all_sentences: list = None, datestring: str ="1996-08-24") -> None: if not all_sentences: - all_sentences = self.read_column_file(filename) + all_sentences = self._read_column_file(filename) train_sentences = [] dev_sentences = [] @@ -5345,19 +5345,19 @@ def _create_train_dev_splits(self, filename, all_sentences=None, datestring="199 else: train_sentences.append(s) - self.save_to_column_file( + self._save_to_column_file( filename.parent / f"{filename.stem}.dev", dev_sentences, ) - self.save_to_column_file( + self._save_to_column_file( filename.parent / f"{filename.stem}.train", train_sentences, ) - def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): + def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_indices: list) -> list[list[str]]: # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file - noisy_labels = self.read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") + noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") for index, sentence in zip(token_indices, noisy_labels): if index.strip() == "docstart": @@ -5371,14 +5371,14 @@ def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices): for token, label in zip(clean_sentence, sentence): label[0] = token[0] # token[0] -> text, token[1] -> BIO label if self.SAVE_TRAINDEV_FILE: - self.save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) + self._save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels - def generate_data_files(self, filename, origin_dataset_name): + def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: str) -> None: with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: token_indices = 
index_file.readlines() - all_clean_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") + all_clean_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train") # os.makedirs(os.path.join('data','noisebench'), exist_ok=True) @@ -5388,7 +5388,7 @@ def generate_data_files(self, filename, origin_dataset_name): ) # copy test set - all_clean_test_sentences = self.read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") + all_clean_test_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] for s in all_clean_test_sentences: @@ -5397,7 +5397,7 @@ def generate_data_files(self, filename, origin_dataset_name): new_s.append([token[0], token[4]]) test_sentences.append(new_s) - self.save_to_column_file(self.base_path / "clean.test", test_sentences) + self._save_to_column_file(self.base_path / "clean.test", test_sentences) class MASAKHA_POS(MultiCorpus): From c7f4a6cd0448386252a55170719ab39fbaf46443 Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:20:42 +0100 Subject: [PATCH 11/12] rename some variables and small refactor --- flair/datasets/sequence_labeling.py | 37 +++++++++++++++-------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 12da50bd49..76c842a68e 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -5297,28 +5297,30 @@ def __init__( def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: with open(filename, "r", errors="replace", encoding="utf-8") as file: lines = file.readlines() - all_x = [] - point = [] + all_sentences = [] + sentence = [] for line in lines: if "\t" in line.strip(): - stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") + stripped_line = line.strip().split("\t") + else: + stripped_line = line.strip().split(" ") - point.append(stripped_line) + sentence.append(stripped_line) if line.strip() == "": - if len(point[:-1]) > 0: - all_x.append(point[:-1]) - point = [] + if len(sentence[:-1]) > 0: + all_sentences.append(sentence[:-1]) + sentence = [] - if len(point) > 0: - all_x.append(point) + if len(sentence) > 0: + all_sentences.append(sentence) - all_x = all_x - return all_x + all_sentences = all_sentences + return all_sentences @staticmethod - def _save_to_column_file(filename: Union[str, Path], list: list[list[str]]) -> None: + def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) -> None: with open(filename, "w", encoding="utf-8") as f: - for sentence in list: + for sentence in sentences: for token in sentence: f.write("\t".join(token)) f.write("\n") @@ -5391,11 +5393,10 @@ def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: all_clean_test_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test") test_sentences = [] - for s in all_clean_test_sentences: - new_s = [] - for token in s: - new_s.append([token[0], token[4]]) - test_sentences.append(new_s) + + for sentence in all_clean_test_sentences: + new_sentence = [[tokens[0], tokens[4]] for tokens in sentence] + test_sentences.append(new_sentence) self._save_to_column_file(self.base_path / "clean.test", test_sentences) From 4e159925f961d2475f0694cefaf1840c52f383fd Mon Sep 17 00:00:00 2001 From: elenamer Date: Fri, 13 Dec 2024 16:35:16 +0100 Subject: [PATCH 12/12] formatting 
and fix some typing --- flair/datasets/sequence_labeling.py | 41 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 76c842a68e..b2ab2f45dd 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1,9 +1,9 @@ import copy +import gzip import json import logging import os import re -import gzip import shutil import tarfile import tempfile @@ -5251,12 +5251,13 @@ def __init__( in_memory (bool): If True the dataset is kept in memory achieving speedups in training. **corpusargs: The arguments propagated to :meth:'flair.datasets.ColumnCorpus.__init__'. """ - VALUE_NOISE_VALUES = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"] - + if noise not in VALUE_NOISE_VALUES: - raise ValueError(f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!") - + raise ValueError( + f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!" + ) + self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path) filename = "clean" if noise == "clean" else f"noise_{noise}" @@ -5270,16 +5271,13 @@ def __init__( if not all(files_exist): cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only") cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only") - + cleanconll_corpus = CLEANCONLL() self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower() # create dataset files from index and train/test splits - self._generate_data_files( - filename, - cleanconll_corpus.__class__.__name__.lower() - ) + self._generate_data_files(filename, cleanconll_corpus.__class__.__name__.lower()) super().__init__( data_folder=self.base_path, @@ -5294,16 +5292,13 @@ def __init__( ) @staticmethod - def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: - with open(filename, "r", errors="replace", encoding="utf-8") as file: + def _read_column_file(filename: Union[str, Path]) -> list[list[list[str]]]: + with open(filename, errors="replace", encoding="utf-8") as file: lines = file.readlines() all_sentences = [] sentence = [] for line in lines: - if "\t" in line.strip(): - stripped_line = line.strip().split("\t") - else: - stripped_line = line.strip().split(" ") + stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ") sentence.append(stripped_line) if line.strip() == "": @@ -5318,7 +5313,7 @@ def _read_column_file(filename: Union[str, Path]) -> list[list[str]]: return all_sentences @staticmethod - def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) -> None: + def _save_to_column_file(filename: Union[str, Path], sentences: list[list[list[str]]]) -> None: with open(filename, "w", encoding="utf-8") as f: for sentence in sentences: for token in sentence: @@ -5326,7 +5321,9 @@ def _save_to_column_file(filename: Union[str, Path], sentences: list[list[str]]) f.write("\n") f.write("\n") - def _create_train_dev_splits(self, filename: Path, all_sentences: list = None, datestring: str ="1996-08-24") -> None: + def _create_train_dev_splits( + self, filename: Path, all_sentences: Optional[list] = None, datestring: str = "1996-08-24" + ) -> None: if not all_sentences: all_sentences = self._read_column_file(filename) @@ -5356,7 +5353,9 @@ def _create_train_dev_splits(self, filename: Path, 
all_sentences: list = None, d train_sentences, ) - def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_indices: list) -> list[list[str]]: + def _merge_tokens_labels( + self, corpus: str, all_clean_sentences: list, token_indices: list + ) -> list[list[list[str]]]: # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev") @@ -5376,9 +5375,9 @@ def _merge_tokens_labels(self, corpus: str, all_clean_sentences: list, token_ind self._save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels) return noisy_labels - def _generate_data_files(self, filename: Union[str, Path], origin_dataset_name: str) -> None: + def _generate_data_files(self, filename: str, origin_dataset_name: str) -> None: - with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as index_file: + with open(self.base_path / "annotations_only" / "index.txt", encoding="utf-8") as index_file: token_indices = index_file.readlines() all_clean_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train")
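
For reference, a minimal usage sketch of the NER_NOISEBENCH class introduced by this series (editorial addition, not part of any patch; it assumes the final merged state, in which the class is exported from flair.datasets and accepts the noise values validated in PATCH 07):

    from flair.datasets import NER_NOISEBENCH

    # Clean reference labels: on first use this downloads the NoiseBench
    # annotation files and the CleanCoNLL corpus, then builds the
    # tab-separated train/dev/test column files in the flair cache.
    clean_corpus = NER_NOISEBENCH(noise="clean")

    # One of the noisy label sets ("crowd", "crowdbest", "expert",
    # "distant", "weak" or "llm"); the test split is always the clean one.
    noisy_corpus = NER_NOISEBENCH(noise="expert")

    print(clean_corpus)  # reports train/dev/test sizes
    label_dict = noisy_corpus.make_label_dictionary(label_type="ner")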