Add support for NoiseBench #3512

Merged (14 commits, Dec 19, 2024)
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -216,6 +216,7 @@
NER_MULTI_WIKINER,
NER_MULTI_XTREME,
NER_NERMUD,
NER_NOISEBENCH,
NER_SWEDISH,
NER_TURKU,
NER_UKRAINIAN,
@@ -494,6 +495,7 @@
"NER_GERMAN_MOBIE",
"NER_GERMAN_POLITICS",
"NER_HIPE_2022",
"NER_NOISEBENCH",
"NER_HUNGARIAN",
"NER_ICDAR_EUROPEANA",
"NER_ICELANDIC",
203 changes: 203 additions & 0 deletions flair/datasets/sequence_labeling.py
@@ -4973,6 +4973,209 @@ def _write_instances(cls, version, base_path, split, data):
out_file.write("\n")


class NER_NOISEBENCH(ColumnCorpus):
label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/"
SAVE_TRAINDEV_FILE = False

def __init__(
self,
noise: str = "clean",
base_path: Optional[Union[str, Path]] = None,
in_memory: bool = True,
**corpusargs,
) -> None:
"""Initialize the NoiseBench corpus.

Args:
noise (string): Chooses the labelset for the data.
clean (default): Clean labels
crowd, crowdbest, expert, distant, weak, llm: Different kinds of noisy labelsets (details: ...)
base_path (Optional[Union[str, Path]]): Path to the data.
Default is None, meaning the corpus gets automatically downloaded and saved.
You can override this by passing a path to a directory containing the unprocessed files but typically this
should not be necessary.
in_memory (bool): If True the dataset is kept in memory achieving speedups in training.
**corpusargs: The arguments propagated to :meth:`flair.datasets.ColumnCorpus.__init__`.
"""
if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]:
raise Exception("Please choose a valid version")
Contributor

It is recommended to raise ValueError for wrong or unexpected values passed to the argument.

Also, the error message would be more helpful if it listed the valid values. Something like:

if noise not in VALID_NOISE_VALUES:
    raise ValueError(f"Unsupported value for noise argument. Got {noise}, expected one of {VALID_NOISE_VALUES}!")
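For illustration, here is a standalone sketch of that check; the constant name VALID_NOISE_VALUES follows the suggestion above and is not part of the current diff:

VALID_NOISE_VALUES = ("clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm")

def validate_noise(noise: str) -> None:
    # hypothetical helper for illustration only; in the class this would be an
    # inline check against a class-level constant
    if noise not in VALID_NOISE_VALUES:
        raise ValueError(f"Unsupported value for noise argument. Got {noise}, expected one of {VALID_NOISE_VALUES}!")

validate_noise("clean")  # passes silently; an unknown value would raise ValueError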


self._set_path(base_path)
Contributor

I would remove this function and just set the base_path here in __init__:

if base_path:
    self.base_path = Path(base_path)
else:
    self.base_path = flair.cache_root / "datasets" / "noisebench"


filename = "clean" if noise == "clean" else f"noise_{noise}"
file_paths = [
self.base_path / f"{filename}.train",
self.base_path / f"{filename}.dev",
self.base_path / "clean.test",
]
files_exist = [path.exists() for path in file_paths]
self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll"

if not all(files_exist):
cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only")
cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only")

cleanconll_files_exist = [
Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"]
]
if not all(cleanconll_files_exist):
# download cleanconll

clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL"
os.system(clone) # Cloning
cwd = os.getcwd()

os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL")
chmod = "chmod u+x create_cleanconll_from_conll03.sh"
os.system(chmod)
create = "bash create_cleanconll_from_conll03.sh"

os.system(create)
os.chdir(cwd)

shutil.move(
f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train",
self.cleanconll_base_path,
)
shutil.move(
f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path
)
shutil.move(
f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path
)

shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL")

# create dataset files from index and train/test splits
self.generate_data_files(
filename,
)

super().__init__(
data_folder=self.base_path,
train_file=f"{filename}.train",
dev_file=f"{filename}.dev",
test_file="clean.test", # test set is always clean (without noise)
column_format={0: "text", 1: "ner"},
in_memory=in_memory,
column_delimiter="\t",
document_separator_token="-DOCSTART-",
**corpusargs,
)
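
For context, a minimal usage sketch of the new corpus once the export in flair/datasets/__init__.py above is in place (the noise value here is only an illustration; the first call downloads the annotation files and builds CleanCoNLL as described above):

from flair.datasets import NER_NOISEBENCH

# load the crowd-annotated variant; train and dev carry the noisy labels,
# while the test split is always the clean one
corpus = NER_NOISEBENCH(noise="crowd", in_memory=True)
print(corpus)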

def _set_path(self, base_path):
self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path)

@staticmethod
def read_column_file(filename):
Contributor

You are adding a new public method to NER_NOISEBENCH. I would avoid this. Also, this method is only used from non-public methods of the class (except in generate_data_files, which I would also make non-public).

My suggestion is to rename the method to _read_column_file and add a type annotation for the filename argument.
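
A sketch of the resulting signature; the type annotation is an assumption, chosen because the diff passes both str and Path values into this method:

@staticmethod
def _read_column_file(filename: Union[str, Path]) -> list:
    # body unchanged from read_column_file below; returns a list of sentences,
    # each sentence being a list of [token, label, ...] column rows
    ...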

with open(filename, errors="replace") as file:
lines = file.readlines()
all_x = []
point = []
for line in lines:
if "\t" in line.strip():
stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ")
Contributor

Looks like the stripped_line = line.strip().split("\t") branch will always be taken, due to the enclosing condition if "\t" in line.strip() in line 5076, so the ternary is redundant. Refactor to:

if "\t" in line.strip():
    stripped_line = line.strip().split("\t")
else:
    stripped_line = line.strip().split(" ")


point.append(stripped_line)
if line.strip() == "":
if len(point[:-1]) > 0:
all_x.append(point[:-1])
point = []

if len(point) > 0:
all_x.append(point)

all_x = all_x
return all_x

@staticmethod
def save_to_column_file(filename, list):
Contributor @fkdosilovic, Oct 31, 2024

  1. Rename to _save_to_column_file
  2. Add type annotations to the arguments
  3. Avoid using the names of built-in modules/types/etc. for argument and variable names. Here, list is used as the argument name; a more appropriate name would be sentences (which is of type list).
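
Taken together, the resulting signature might look roughly like this (the annotations are assumptions on top of the points above):

@staticmethod
def _save_to_column_file(filename: Union[str, Path], sentences: list) -> None:
    # body unchanged from save_to_column_file below, with the argument
    # formerly named `list` renamed to `sentences`
    ...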

with open(filename, "w") as f:
for sentence in list:
for token in sentence:
Contributor

Rename the iteration variable to tokens to convey that it is a container of elements and not a single token.

f.write("\t".join(token))
f.write("\n")
f.write("\n")

def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"):
if not all_sentences:
all_sentences = self.read_column_file(filename)

train_sentences = []
dev_sentences = []
for i, s in enumerate(all_sentences):
if "DOCSTART" in s[0][0]:
assert i + 3 < len(all_sentences) # last document is too short

# news date is usually in 3rd or 4th sentence of each article
if datestring in all_sentences[i + 2][-1][0] or datestring in all_sentences[i + 3][-1][0]:
save_to_dev = True
else:
save_to_dev = False

if save_to_dev:
dev_sentences.append(s)
else:
train_sentences.append(s)

self.save_to_column_file(
os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev",
dev_sentences,
)
self.save_to_column_file(
os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".train",
train_sentences,
)

def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices):
# generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file

noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev"))
Contributor

Same here, use self.base_path / "annotations_only" / f"{corpus}.traindev" instead of os.path.join.
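
Applied to the call above, and using the _read_column_file rename from the earlier comment, that would read roughly:

noisy_labels = self._read_column_file(self.base_path / "annotations_only" / f"{corpus}.traindev")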

# print(noisy_labels)
# print(token_indices)
Contributor

Remove these debugging prints.

for index, sentence in zip(token_indices, noisy_labels):

if index.strip() == "docstart":
assert len(sentence) == 1
sentence[0][0] = "-DOCSTART-"
continue
clean_sentence = all_clean_sentences[int(index.strip())]

assert len(clean_sentence) == len(sentence) # this means indexing is wrong

for token, label in zip(clean_sentence, sentence):
label[0] = token[0] # token[0] -> text, token[1] -> BIO label
if self.SAVE_TRAINDEV_FILE:
self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels)
Contributor

Same here, use self.base_path / f"{corpus}.traindev" instead of os.path.join.

return noisy_labels

def generate_data_files(self, filename):

with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file:
Contributor

You can avoid this os.path.join since base_path is of type Path; just use / to concatenate path segments. Also, we should avoid relying on implicit default values; it is better to state explicitly that we want to read the file as utf-8:

with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as fp:
    # ...

token_indices = index_file.readlines()

all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train"))
Contributor

Same here, use self.cleanconll_base_path / "cleanconll.train" instead of os.path.join.


# os.makedirs(os.path.join('data','noisebench'), exist_ok=True)

noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices)
self._create_train_dev_splits(
all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev")
)

# copy test set
all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test"))
Contributor

Same here, use self.cleanconll_base_path / "cleanconll.test" instead of os.path.join.


test_sentences = []
for s in all_clean_test_sentences:
new_s = []
for token in s:
new_s.append([token[0], token[4]])
test_sentences.append(new_s)
Contributor

This can be refactored to a more idiomatic approach with list comprehensions:

for sentence in all_clean_test_sentences:
    new_sentence = [[tokens[0], tokens[4]] for tokens in sentence]
    test_sentences.append(new_sentence)


self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences)
Contributor

Same here, use self.base_path / "clean.test" instead of os.path.join.



class MASAKHA_POS(MultiCorpus):
def __init__(
self,