Add support for NoiseBench #3512
Changes from 5 commits
@@ -4973,6 +4973,209 @@ def _write_instances(cls, version, base_path, split, data):
                out_file.write("\n")


class NER_NOISEBENCH(ColumnCorpus):
    label_url = "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/"
    SAVE_TRAINDEV_FILE = False

    def __init__(
        self,
        noise: str = "clean",
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
        """Initialize the NoiseBench corpus.

        Args:
            noise (string): Chooses the labelset for the data.
                clean (default): clean labels
                crowd, crowdbest, expert, distant, weak, llm: different kinds of noisy labelsets (details: ...)
            base_path (Optional[Union[str, Path]]): Path to the data.
                Default is None, meaning the corpus gets automatically downloaded and saved.
                You can override this by passing a path to a directory containing the unprocessed files, but
                typically this should not be necessary.
            in_memory (bool): If True, the dataset is kept in memory, achieving speedups in training.
            **corpusargs: The arguments propagated to :meth:`flair.datasets.ColumnCorpus.__init__`.
        """
        if noise not in ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]:
            raise Exception("Please choose a valid version")

        self._set_path(base_path)
I would remove this function and just set the `base_path` directly:

```python
if base_path:
    self.base_path = Path(base_path)
else:
    self.base_path = flair.cache_root / "datasets" / "noisebench"
```

        filename = "clean" if noise == "clean" else f"noise_{noise}"
        file_paths = [
            self.base_path / f"{filename}.train",
            self.base_path / f"{filename}.dev",
            self.base_path / "clean.test",
        ]
        files_exist = [path.exists() for path in file_paths]
        self.cleanconll_base_path = flair.cache_root / "datasets" / "cleanconll"

        if not all(files_exist):
            cached_path(f"{self.label_url}/{filename}.traindev", self.base_path / "annotations_only")
            cached_path(f"{self.label_url}/index.txt", self.base_path / "annotations_only")

            cleanconll_files_exist = [
                Path(f"{self.cleanconll_base_path}/cleanconll.{split}").exists() for split in ["train", "dev", "test"]
            ]
            if not all(cleanconll_files_exist):
                # download cleanconll
                clone = f"git clone https://github.com/flairNLP/CleanCoNLL.git {self.cleanconll_base_path}/CleanCoNLL"
                os.system(clone)  # Cloning
                cwd = os.getcwd()

                os.chdir(f"{self.cleanconll_base_path}/CleanCoNLL")
                chmod = "chmod u+x create_cleanconll_from_conll03.sh"
                os.system(chmod)
                create = "bash create_cleanconll_from_conll03.sh"
                os.system(create)
                os.chdir(cwd)

                shutil.move(
                    f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.train",
                    self.cleanconll_base_path,
                )
                shutil.move(
                    f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.dev", self.cleanconll_base_path
                )
                shutil.move(
                    f"{self.cleanconll_base_path}/CleanCoNLL/data/cleanconll/cleanconll.test", self.cleanconll_base_path
                )

                shutil.rmtree(self.cleanconll_base_path / "CleanCoNLL")

            # create dataset files from index and train/test splits
            self.generate_data_files(
                filename,
            )

        super().__init__(
            data_folder=self.base_path,
            train_file=f"{filename}.train",
            dev_file=f"{filename}.dev",
            test_file="clean.test",  # test set is always clean (without noise)
            column_format={0: "text", 1: "ner"},
            in_memory=in_memory,
            column_delimiter="\t",
            document_separator_token="-DOCSTART-",
            **corpusargs,
        )
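For reviewers who want to try this out, a minimal usage sketch (assuming the class is exported from `flair.datasets` like the other corpora in this file, and using the `noise` values listed in the docstring):

```python
from flair.datasets import NER_NOISEBENCH

# the first call downloads the annotations and builds the corpus files;
# train/dev carry the chosen noise, the test set is always clean
corpus = NER_NOISEBENCH(noise="crowd")
print(corpus)
```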

    def _set_path(self, base_path):
        self.base_path = flair.cache_root / "datasets" / "noisebench" if not base_path else Path(base_path)

    @staticmethod
    def read_column_file(filename):
You are adding a new public method to this class. My suggestion is to rename the method to a private one (prefix it with an underscore) so it does not become part of the public API.

        with open(filename, errors="replace") as file:
            lines = file.readlines()
        all_x = []
        point = []
        for line in lines:
            if "\t" in line.strip():
                stripped_line = line.strip().split("\t") if "\t" in line.strip() else line.strip().split(" ")
Looks like the ternary repeats the check from the line above, so its `else` branch is never taken. This was probably meant to be:

```python
if "\t" in line.strip():
    stripped_line = line.strip().split("\t")
else:
    stripped_line = line.strip().split(" ")
```

            point.append(stripped_line)
            if line.strip() == "":
                if len(point[:-1]) > 0:
                    all_x.append(point[:-1])
                point = []

        if len(point) > 0:
            all_x.append(point)

        return all_x
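To make the return shape concrete while reviewing, a small illustration (example values invented):

```python
# read_column_file returns a list of sentences; each sentence is a list of
# per-token column rows, e.g. for a two-column (token, label) file:
sentences = [
    [["EU", "B-ORG"], ["rejects", "O"], ["German", "B-MISC"]],  # sentence 1
    [["Peter", "B-PER"], ["Blackburn", "I-PER"]],               # sentence 2
]
```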

    @staticmethod
    def save_to_column_file(filename, list):

        with open(filename, "w") as f:
            for sentence in list:
                for token in sentence:
Rename the iteration variable to a more descriptive name; each element here is a list of column values, not a single token string.

                    f.write("\t".join(token))
                    f.write("\n")
                f.write("\n")

    def _create_train_dev_splits(self, filename, all_sentences=None, datestring="1996-08-24"):
        if not all_sentences:
            all_sentences = self.read_column_file(filename)

        train_sentences = []
        dev_sentences = []
        for i, s in enumerate(all_sentences):
            if "DOCSTART" in s[0][0]:
                assert i + 3 < len(all_sentences)  # last document is too short

                # news date is usually in 3rd or 4th sentence of each article
                if datestring in all_sentences[i + 2][-1][0] or datestring in all_sentences[i + 3][-1][0]:
                    save_to_dev = True
                else:
                    save_to_dev = False

            if save_to_dev:
                dev_sentences.append(s)
            else:
                train_sentences.append(s)

        self.save_to_column_file(
            os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".dev",
            dev_sentences,
        )
        self.save_to_column_file(
            os.sep.join(filename.split(os.sep)[:-1]) + os.sep + filename.split(os.sep)[-1].split(".")[0] + ".train",
            train_sentences,
        )
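In line with the `Path` suggestions elsewhere in this review, the `os.sep` string surgery above could be written more readably with `pathlib` (a sketch, assuming filenames carry exactly one extension, as the `.traindev` files here do):

```python
self.save_to_column_file(Path(filename).with_suffix(".dev"), dev_sentences)
self.save_to_column_file(Path(filename).with_suffix(".train"), train_sentences)
```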

    def _merge_tokens_labels(self, corpus, all_clean_sentences, token_indices):
        # generate NoiseBench dataset variants, given CleanCoNLL, noisy label files and index file

        noisy_labels = self.read_column_file(os.path.join(self.base_path, "annotations_only", f"{corpus}.traindev"))
Same here, use `Path` operations (the `/` operator) instead of `os.path.join`.
        # print(noisy_labels)
        # print(token_indices)

Remove these debugging prints.

        for index, sentence in zip(token_indices, noisy_labels):

            if index.strip() == "docstart":
                assert len(sentence) == 1
                sentence[0][0] = "-DOCSTART-"
                continue
            clean_sentence = all_clean_sentences[int(index.strip())]

            assert len(clean_sentence) == len(sentence)  # if this fails, the indexing is wrong

            for token, label in zip(clean_sentence, sentence):
                label[0] = token[0]  # token[0] -> text, token[1] -> BIO label
        if self.SAVE_TRAINDEV_FILE:
            self.save_to_column_file(os.path.join(self.base_path, f"{corpus}.traindev"), noisy_labels)
Same here, use `Path` operations instead of `os.path.join`.

        return noisy_labels

    def generate_data_files(self, filename):

        with open(os.path.join(self.base_path, "annotations_only", "index.txt")) as index_file:
You can avoid this by building the path with the `/` operator and passing an explicit encoding:

```python
with open(self.base_path / "annotations_only" / "index.txt", "r", encoding="utf-8") as fp:
    # ...
```

            token_indices = index_file.readlines()

        all_clean_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.train"))
Same here, use `Path` operations instead of `os.path.join`.

        # os.makedirs(os.path.join('data','noisebench'), exist_ok=True)

        noisy_sentences = self._merge_tokens_labels(filename, all_clean_sentences, token_indices)
        self._create_train_dev_splits(
            all_sentences=noisy_sentences, filename=os.path.join(self.base_path, f"{filename}.traindev")
        )

        # copy test set
        all_clean_test_sentences = self.read_column_file(os.path.join(self.cleanconll_base_path, "cleanconll.test"))
Same here, use `Path` operations instead of `os.path.join`.
|
||
test_sentences = [] | ||
for s in all_clean_test_sentences: | ||
new_s = [] | ||
for token in s: | ||
new_s.append([token[0], token[4]]) | ||
test_sentences.append(new_s) | ||
This can be refactored to a more idiomatic approach with list comprehensions:

```python
for sentence in all_clean_test_sentences:
    new_sentence = [[tokens[0], tokens[4]] for tokens in sentence]
    test_sentences.append(new_sentence)
```

        self.save_to_column_file(os.path.join(self.base_path, "clean.test"), test_sentences)
Same here, use `Path` operations instead of `os.path.join`.


class MASAKHA_POS(MultiCorpus):
    def __init__(
        self,
It is recommended to raise `ValueError` for wrong and unexpected values passed to the argument. Also, an error message would be more helpful if it listed valid values. Something like:
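```python
# a sketch completing the truncated suggestion; valid values copied from the check in __init__
valid_noise_values = ["clean", "crowd", "crowdbest", "expert", "distant", "weak", "llm"]
if noise not in valid_noise_values:
    raise ValueError(f"Unsupported value '{noise}' for noise, expected one of {valid_noise_values}")
```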