From a60a8d1b781c55b2b004b6c8f94cbd371f723b24 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 12 Aug 2024 13:37:21 +0200 Subject: [PATCH 1/6] add requirements to doc build --- docs/requirements.txt | 7 +++- scripts/span_tagger.py | 22 ++++++++++++ scripts/train_span_tagger_multi.py | 58 ++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 scripts/span_tagger.py create mode 100644 scripts/train_span_tagger_multi.py diff --git a/docs/requirements.txt b/docs/requirements.txt index 8d7ae05d70..0e8c4f6141 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,4 +5,9 @@ sphinx importlib-metadata sphinx-multiversion pydata-sphinx-theme<0.14 -sphinx_design \ No newline at end of file +sphinx_design + +# previous dependencies that are required to build docs for later versions too. +semver +gensim +bpemb \ No newline at end of file diff --git a/scripts/span_tagger.py b/scripts/span_tagger.py new file mode 100644 index 0000000000..a09889eaa8 --- /dev/null +++ b/scripts/span_tagger.py @@ -0,0 +1,22 @@ +from flair.data import Sentence +from flair.models import MultitaskModel + +# For comparison: This works since the label type is "ner" for both models in the multitask model +classifier: MultitaskModel = MultitaskModel.load("zelda") + +sentence = Sentence("Kirk and Spock met on the Enterprise") + +classifier.predict(sentence) + +print(sentence) + +# Giving them sensible label names, now made possible with this PR +classifier.tasks["Task_1"]._label_type = "nel" +classifier.tasks["Task_1"]._span_label_type = "ner" + +# However, this no longer makes predictions +sentence = Sentence("Kirk and Spock met on the Enterprise") + +classifier.predict(sentence) + +print(sentence) diff --git a/scripts/train_span_tagger_multi.py b/scripts/train_span_tagger_multi.py new file mode 100644 index 0000000000..0798a8b47b --- /dev/null +++ b/scripts/train_span_tagger_multi.py @@ -0,0 +1,58 @@ +from flair.datasets import NER_MULTI_WIKINER, ZELDA +from flair.embeddings import TransformerWordEmbeddings +from flair.models import SequenceTagger, SpanClassifier +from flair.models.entity_linker_model import CandidateGenerator +from flair.trainers import ModelTrainer +from flair.nn import PrototypicalDecoder +from flair.nn.multitask import make_multitask_model_and_corpus + +# 1. get the corpus +ner_corpus = NER_MULTI_WIKINER().downsample(0.001) +nel_corpus = ( + ZELDA(column_format={0: "text", 2: "nel"}) + .downsample(0.0001, downsample_dev=False, downsample_test=False) + .downsample(0.01, downsample_train=False) +) # need to set the label type to be the same as the ner one + +# --- Embeddings that are shared by both models --- # +shared_embeddings = TransformerWordEmbeddings("distilbert-base-uncased", fine_tune=True) + +ner_label_dict = ner_corpus.make_label_dictionary("ner", add_unk=False) + +ner_model = SequenceTagger( + embeddings=shared_embeddings, + tag_dictionary=ner_label_dict, + tag_type="ner", + use_rnn=False, + use_crf=False, + reproject_embeddings=False, +) + + +nel_label_dict = nel_corpus.make_label_dictionary("nel", add_unk=True) + +nel_model = SpanClassifier( + embeddings=shared_embeddings, + label_dictionary=nel_label_dict, + label_type="nel", + span_label_type="ner", + decoder=PrototypicalDecoder( + num_prototypes=len(nel_label_dict), + embeddings_size=shared_embeddings.embedding_length * 2, # we use "first_last" encoding for spans + distance_function="dot_product", + ), + candidates=CandidateGenerator("zelda"), +) + + +# -- Define mapping (which tagger should train on which model) -- # +multitask_model, multicorpus = make_multitask_model_and_corpus( + [ + (ner_model, ner_corpus), + (nel_model, nel_corpus), + ] +) + +# -- Create model trainer and train -- # +trainer = ModelTrainer(multitask_model, multicorpus) +trainer.fine_tune(f"resources/taggers/zelda_with_mention", mini_batch_chunk_size=1) From ee470ee2962443cdde1221c1078ab7baaa8243a8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 16 Aug 2024 13:29:33 +0200 Subject: [PATCH 2/6] invalidate old cache --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c801b96a70..5b3633d93e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: uses: actions/cache@v3 with: path: ./cache - key: cache-v1.1 + key: cache-v1.2 - name: Run tests run: | python -c 'import flair' From 0ab903639f5e6126d7ebed3bf4623278bca5e48b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 16 Aug 2024 14:15:32 +0200 Subject: [PATCH 3/6] remove skipped tests and unused test symbols --- tests/test_datasets_biomedical.py | 234 ------------------------------ 1 file changed, 234 deletions(-) diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 4099bb9288..815e2f8bf6 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -26,80 +26,6 @@ logger.propagate = True -def has_balanced_parantheses(text: str) -> bool: - stack = [] - opening = ["(", "[", "{"] - closing = [")", "]", "}"] - for c in text: - if c in opening: - stack.append(c) - elif c in closing: - if not stack: - return False - last_paren = stack.pop() - if opening.index(last_paren) != closing.index(c): - return False - - return len(stack) == 0 - - -def gene_predicate(member): - return inspect.isclass(member) and "HUNER_GENE_" in str(member) - - -def chemical_predicate(member): - return inspect.isclass(member) and "HUNER_CHEMICAL_" in str(member) - - -def disease_predicate(member): - return inspect.isclass(member) and "HUNER_DISEASE_" in str(member) - - -def species_predicate(member): - return inspect.isclass(member) and "HUNER_SPECIES_" in str(member) - - -def cellline_predicate(member): - return inspect.isclass(member) and "HUNER_CELL_LINE_" in str(member) - - -CELLLINE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=cellline_predicate), key=itemgetter(0)) -] -CHEMICAL_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=chemical_predicate), key=itemgetter(0)) -] -DISEASE_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=disease_predicate), key=itemgetter(0)) -] -GENE_DATASETS = [i[1] for i in sorted(inspect.getmembers(biomedical, predicate=gene_predicate), key=itemgetter(0))] -SPECIES_DATASETS = [ - i[1] for i in sorted(inspect.getmembers(biomedical, predicate=species_predicate), key=itemgetter(0)) -] -ALL_DATASETS = CELLLINE_DATASETS + CHEMICAL_DATASETS + DISEASE_DATASETS + GENE_DATASETS + SPECIES_DATASETS - - -def simple_tokenizer(text: str) -> List[str]: - tokens: List[str] = [] - word = "" - index = -1 - for index, char in enumerate(text): - if char == " " or char == "-": - if len(word) > 0: - tokens.append(word) - - word = "" - else: - word += char - - # increment for last token in sentence if not followed by whitespace - index += 1 - if len(word) > 0: - tokens.append(word) - - return tokens - - def test_write_to_conll(): text = "This is entity1 entity2 and a long entity3" dataset = InternalBioNerDataset( @@ -220,163 +146,3 @@ def test_filter_nested_entities(caplog): sorted(entities, key=lambda x: str(x)), ): assert str(e1) == str(e2) - - -def sanity_check_all_corpora(check: Callable[[ColumnCorpus], None]): - for _, CorpusType in tqdm(ALL_DATASETS): - corpus = CorpusType() - check(corpus) - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -def test_sanity_not_starting_with_minus(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - entities_starting_with_minus = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if str(entity.tokens[0].text).startswith("-"): - entities_starting_with_minus.append(" ".join([t.text for t in entity.tokens])) - - assert len(entities_starting_with_minus) == 0, "|".join(entities_starting_with_minus) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_repeating_Bs(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_repeat_tokens: List[Token] = [] - repeat_tokens: List[Token] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - for token in sentence.tokens: - if token.get_labels()[0].value.startswith("B") or token.get_labels()[0].value.startswith("S"): - repeat_tokens.append(token) - else: - if len(repeat_tokens) > len(longest_repeat_tokens): - longest_repeat_tokens = repeat_tokens - repeat_tokens = [] - - assert len(longest_repeat_tokens) < 4 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_long_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - longest_entity: List[str] = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - if len(entity.tokens) > len(longest_entity): - longest_entity = [t.text for t in entity.tokens] - - assert len(longest_entity) < 10, " ".join(longest_entity) - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_unmatched_parentheses(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - unbalanced_entities = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - for entity in entities: - entity_text = "".join(t.text for t in entity.tokens) - if not has_balanced_parantheses(entity_text): - unbalanced_entities.append(entity_text) - - assert unbalanced_entities == [] - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_not_too_many_entities(CorpusType: Type[ColumnCorpus]): - corpus = CorpusType() # type: ignore[call-arg] - n_entities_per_sentence = [] - for sentence in _iter_dataset(corpus.get_all_sentences()): - entities = sentence.get_spans("ner") - n_entities_per_sentence.append(len(entities)) - avg_entities_per_sentence = sum(n_entities_per_sentence) / len(n_entities_per_sentence) - - assert avg_entities_per_sentence <= 5 - - -@pytest.mark.parametrize("CorpusType", ALL_DATASETS) -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_sanity_no_misaligned_entities(CorpusType: Type[HunerDataset]): - dataset_name = CorpusType.__class__.__name__.lower() - base_path = flair.cache_root / "datasets" - data_folder = base_path / dataset_name - - corpus = CorpusType() - internal = corpus.to_internal(data_folder) - for doc_id, _doc_text in internal.documents.items(): - misaligned_starts = [] - misaligned_ends: List[int] = [] - - entities = internal.entities_per_document[doc_id] - entity_starts = [i.char_span.start for i in entities] - entity_ends = [i.char_span.stop for i in entities] - - for start in entity_starts: - if start not in entity_starts: - misaligned_starts.append(start) - - for end in entity_ends: - if end not in entity_ends: - misaligned_starts.append(end) - - assert len(misaligned_starts) <= len(entities) // 10 - assert len(misaligned_ends) <= len(entities) // 10 - - -@pytest.mark.skip(reason="We skip this test because it's only relevant for development purposes") -def test_scispacy_tokenization(): - from flair.tokenization import SciSpacyTokenizer - - spacy_tokenizer = SciSpacyTokenizer() - - sentence = Sentence("HBeAg(+) patients", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 5 - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "patients" - assert sentence[4].start_position == 9 - - sentence = Sentence("HBeAg(+)/HBsAg(+)", use_tokenizer=spacy_tokenizer) - assert len(sentence) == 9 - - assert sentence[0].text == "HBeAg" - assert sentence[0].start_position == 0 - assert sentence[1].text == "(" - assert sentence[1].start_position == 5 - assert sentence[2].text == "+" - assert sentence[2].start_position == 6 - assert sentence[3].text == ")" - assert sentence[3].start_position == 7 - assert sentence[4].text == "/" - assert sentence[4].start_position == 8 - assert sentence[5].text == "HBsAg" - assert sentence[5].start_position == 9 - assert sentence[6].text == "(" - assert sentence[6].start_position == 14 - assert sentence[7].text == "+" - assert sentence[7].start_position == 15 - assert sentence[8].text == ")" - assert sentence[8].start_position == 16 - - sentence = Sentence("doxorubicin (DOX)-induced", use_tokenizer=spacy_tokenizer) - - assert len(sentence) == 5 - assert sentence[0].text == "doxorubicin" - assert sentence[1].text == "(" - assert sentence[2].text == "DOX" - assert sentence[3].text == ")" - assert sentence[4].text == "-induced" From f13f1894a2616d8201e33c5ce3a194b133fbd027 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 16 Aug 2024 14:36:17 +0200 Subject: [PATCH 4/6] remove skipped tests and unused test symbols --- tests/test_datasets_biomedical.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tests/test_datasets_biomedical.py b/tests/test_datasets_biomedical.py index 815e2f8bf6..0264b08394 100644 --- a/tests/test_datasets_biomedical.py +++ b/tests/test_datasets_biomedical.py @@ -1,21 +1,12 @@ -import inspect import logging import os import tempfile -from operator import itemgetter from pathlib import Path -from typing import Callable, List, Optional, Type +from typing import List, Optional -import pytest -from tqdm import tqdm - -import flair -from flair.data import Sentence, Token, _iter_dataset -from flair.datasets import ColumnCorpus, biomedical from flair.datasets.biomedical import ( CoNLLWriter, Entity, - HunerDataset, InternalBioNerDataset, filter_nested_entities, ) From 552e6b919591a167b9d00d971a50b1389cf40f00 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Aug 2024 19:44:44 +0200 Subject: [PATCH 5/6] remove scripts that shouldn't be commited --- scripts/span_tagger.py | 22 ------------ scripts/train_span_tagger_multi.py | 58 ------------------------------ 2 files changed, 80 deletions(-) delete mode 100644 scripts/span_tagger.py delete mode 100644 scripts/train_span_tagger_multi.py diff --git a/scripts/span_tagger.py b/scripts/span_tagger.py deleted file mode 100644 index a09889eaa8..0000000000 --- a/scripts/span_tagger.py +++ /dev/null @@ -1,22 +0,0 @@ -from flair.data import Sentence -from flair.models import MultitaskModel - -# For comparison: This works since the label type is "ner" for both models in the multitask model -classifier: MultitaskModel = MultitaskModel.load("zelda") - -sentence = Sentence("Kirk and Spock met on the Enterprise") - -classifier.predict(sentence) - -print(sentence) - -# Giving them sensible label names, now made possible with this PR -classifier.tasks["Task_1"]._label_type = "nel" -classifier.tasks["Task_1"]._span_label_type = "ner" - -# However, this no longer makes predictions -sentence = Sentence("Kirk and Spock met on the Enterprise") - -classifier.predict(sentence) - -print(sentence) diff --git a/scripts/train_span_tagger_multi.py b/scripts/train_span_tagger_multi.py deleted file mode 100644 index 0798a8b47b..0000000000 --- a/scripts/train_span_tagger_multi.py +++ /dev/null @@ -1,58 +0,0 @@ -from flair.datasets import NER_MULTI_WIKINER, ZELDA -from flair.embeddings import TransformerWordEmbeddings -from flair.models import SequenceTagger, SpanClassifier -from flair.models.entity_linker_model import CandidateGenerator -from flair.trainers import ModelTrainer -from flair.nn import PrototypicalDecoder -from flair.nn.multitask import make_multitask_model_and_corpus - -# 1. get the corpus -ner_corpus = NER_MULTI_WIKINER().downsample(0.001) -nel_corpus = ( - ZELDA(column_format={0: "text", 2: "nel"}) - .downsample(0.0001, downsample_dev=False, downsample_test=False) - .downsample(0.01, downsample_train=False) -) # need to set the label type to be the same as the ner one - -# --- Embeddings that are shared by both models --- # -shared_embeddings = TransformerWordEmbeddings("distilbert-base-uncased", fine_tune=True) - -ner_label_dict = ner_corpus.make_label_dictionary("ner", add_unk=False) - -ner_model = SequenceTagger( - embeddings=shared_embeddings, - tag_dictionary=ner_label_dict, - tag_type="ner", - use_rnn=False, - use_crf=False, - reproject_embeddings=False, -) - - -nel_label_dict = nel_corpus.make_label_dictionary("nel", add_unk=True) - -nel_model = SpanClassifier( - embeddings=shared_embeddings, - label_dictionary=nel_label_dict, - label_type="nel", - span_label_type="ner", - decoder=PrototypicalDecoder( - num_prototypes=len(nel_label_dict), - embeddings_size=shared_embeddings.embedding_length * 2, # we use "first_last" encoding for spans - distance_function="dot_product", - ), - candidates=CandidateGenerator("zelda"), -) - - -# -- Define mapping (which tagger should train on which model) -- # -multitask_model, multicorpus = make_multitask_model_and_corpus( - [ - (ner_model, ner_corpus), - (nel_model, nel_corpus), - ] -) - -# -- Create model trainer and train -- # -trainer = ModelTrainer(multitask_model, multicorpus) -trainer.fine_tune(f"resources/taggers/zelda_with_mention", mini_batch_chunk_size=1) From 975931ce99c9ab49d1ee103f01552b2676ea7489 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Aug 2024 19:45:14 +0200 Subject: [PATCH 6/6] add trailing whitespace to .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cbd5be92d3..89a4bb39e7 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,4 @@ resources/taggers/ regression_train/ /doc_build/ -scripts/ \ No newline at end of file +scripts/