Add support for NoiseBench #3512

Merged
merged 14 commits on Dec 19, 2024
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -217,6 +217,7 @@
NER_MULTI_WIKINER,
NER_MULTI_XTREME,
NER_NERMUD,
NER_NOISEBENCH,
NER_SWEDISH,
NER_TURKU,
NER_UKRAINIAN,
@@ -496,6 +497,7 @@
"NER_GERMAN_MOBIE",
"NER_GERMAN_POLITICS",
"NER_HIPE_2022",
"NER_NOISEBENCH",
"NER_HUNGARIAN",
"NER_ICDAR_EUROPEANA",
"NER_ICELANDIC",
175 changes: 174 additions & 1 deletion flair/datasets/sequence_labeling.py
@@ -1,9 +1,9 @@
import copy
import gzip
import json
import logging
import os
import re
import gzip
import shutil
import tarfile
import tempfile
@@ -5227,6 +5227,179 @@ def _write_instances(cls, version, base_path, split, data):
out_file.write("\n")


class NER_NOISEBENCH(ColumnCorpus):
label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/"
SAVE_TRAINDEV_FILE = False

def __init__(
self,
noise: str = "clean",
base_path: Optional[Union[str, Path]] = None,
in_memory: bool = True,
**corpusargs,
) -> None:
"""Initialize the NoiseBench corpus.

Args:
noise (string): Chooses the label set for the data.
clean (default): Clean labels
crowd, crowdbest, expert, distant, weak, llm: Different kinds of noisy label sets (details: ...)
base_path (Optional[Union[str, Path]]): Path to the data.
Default is None, meaning the corpus gets automatically downloaded and saved.
You can override this by passing a path to a directory containing the unprocessed files but typically this
should not be necessary.
in_memory (bool): If True, the dataset is kept in memory, achieving speedups in training.
**corpusargs: The arguments propagated to :meth:`flair.datasets.ColumnCorpus.__init__`.
"""
VALUE_NOISE_VALUES = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]

if noise not in VALUE_NOISE_VALUES:
raise ValueError(
f"Unsupported value for noise type argument. Got {noise}, expected one of {VALUE_NOISE_VALUES}!"
)

self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path)

filename = "clean" if noise == "clean" else f"noise_{noise}"
file_paths = [
self.base_path / f"{filename}.train",
self.base_path / f"{filename}.dev",
self.base_path / "clean.test",
]
files_exist = [path.exists() for path in file_paths]

if not all(files_exist):
cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only")
cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only")

cleanconll_corpus = CLEANCONLL()

self.cleanconll_base_path = flair.cache_root / "datasets" / cleanconll_corpus.__class__.__name__.lower()

# create dataset files from index and train/test splits
self._generate_data_files(filename, cleanconll_corpus.__class__.__name__.lower())

super().__init__(
data_folder=self.base_path,
train_file=f"{filename}.train",
dev_file=f"{filename}.dev",
test_file="clean.test", # test set is always clean (without noise)
column_format={0: "text", 1: "ner"},
in_memory=in_memory,
column_delimiter="\t",
document_separator_token="-DOCSTART-",
**corpusargs,
)

@staticmethod
def _read_column_file(filename: Union[str, Path]) -> list[list[list[str]]]:
with open(filename, errors="replace", encoding="utf-8") as file:
lines = file.readlines()
all_sentences = []
sentence = []
for line in lines:
stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ")

sentence.append(stripped_line)
if line.strip() == "":
if len(sentence[:-1]) > 0:
all_sentences.append(sentence[:-1])
sentence = []

if len(sentence) > 0:
all_sentences.append(sentence)

return all_sentences

@staticmethod
def _save_to_column_file(filename: Union[str, Path], sentences: list[list[list[str]]]) -> None:
with open(filename, "w", encoding="utf-8") as f:
for sentence in sentences:
for token in sentence:
Contributor

Rename the iteration variable to `tokens` to convey that this is a container of elements and not a single token.
f.write("\t".join(token))
f.write("\n")
f.write("\n")

def _create_train_dev_splits(
self, filename: Path, all_sentences: Optional[list] = None, datestring: str = "1996-08-24"
) -> None:
if not all_sentences:
all_sentences = self._read_column_file(filename)

train_sentences = []
dev_sentences = []
for i, s in enumerate(all_sentences):
if "DOCSTART" in s[0][0]:
assert i + 3 < len(all_sentences)  # fails if the last document is too short

# news date is usually in 3rd or 4th sentence of each article
if datestring in all_sentences[i + 2][-1][0] or datestring in all_sentences[i + 3][-1][0]:
save_to_dev = True
else:
save_to_dev = False

if save_to_dev:
dev_sentences.append(s)
else:
train_sentences.append(s)

self._save_to_column_file(
filename.parent / f"{filename.stem}.dev",
dev_sentences,
)
self._save_to_column_file(
filename.parent / f"{filename.stem}.train",
train_sentences,
)

def _merge_tokens_labels(
self, corpus: str, all_clean_sentences: list, token_indices: list
) -> list[list[list[str]]]:
# generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file

noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev")
for index, sentence in zip(token_indices, noisy_labels):

if index.strip() == "docstart":
assert len(sentence) == 1
sentence[0][0] = "-DOCSTART-"
continue
clean_sentence = all_clean_sentences[int(index.strip())]

assert len(clean_sentence) == len(sentence)  # a length mismatch means the indexing is wrong

for token, label in zip(clean_sentence, sentence):
label[0] = token[0] # token[0] -> text, token[1] -> BIO label
if self.SAVE_TRAINDEV_FILE:
self._save_to_column_file(self.base_path / f"{corpus}.traindev", noisy_labels)
return noisy_labels

def _generate_data_files(self, filename: str, origin_dataset_name: str) -> None:

with open(self.base_path / "annotations_only" / "index.txt", encoding="utf-8") as index_file:
token_indices = index_file.readlines()
all_clean_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.train")

# os.makedirs(os.path.join('data','noisebench'), exist_ok=True)

noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices)
self._create_train_dev_splits(
all_sentences=noisy_sentences, filename=self.base_path / f"{filename}.traindev"
)

# copy test set
all_clean_test_sentences = self._read_column_file(self.cleanconll_base_path / f"{origin_dataset_name}.test")

test_sentences = []

for sentence in all_clean_test_sentences:
new_sentence = [[tokens[0], tokens[4]] for tokens in sentence]
test_sentences.append(new_sentence)

self._save_to_column_file(self.base_path / "clean.test", test_sentences)
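
For reference, a minimal usage sketch of the new corpus class, assuming this PR is installed; the noise values follow the docstring above, and the data is downloaded automatically on first use:

from flair.datasets import NER_NOISEBENCH

# load the clean variant: train, dev and test all carry clean labels
corpus_clean = NER_NOISEBENCH(noise="clean")

# load a noisy variant, e.g. crowdsourced labels; train and dev labels are noisy,
# while the test split is always clean, so evaluation is unaffected by label noise
corpus_crowd = NER_NOISEBENCH(noise="crowd", in_memory=True)

# the splits are regular flair datasets in two-column (token <TAB> ner) format
print(corpus_crowd)
print(corpus_crowd.train[0])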


class MASAKHA_POS(MultiCorpus):
def __init__(
self,