diff --git a/CHANGELOG.md b/CHANGELOG.md index e3999b3..feca264 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## 0.1.2 (2023-10-03) + +### Added + +- Parsing qualifier/meta annotation info + ## 0.1.1 (2023-09-27) ### Added - Classes for `Annotation`, `Document` and `Dataset` - Parsing of `clinlp` output -- Parsing of `mecattrainer` output +- Parsing of `medcattrainer` output ## 0.1.0 (2023-08-14) diff --git a/README.md b/README.md index 9f072e1..7c274b5 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ pip install -e . A small example to get started: ```python -from clin_nlp_metrics.metrics import Dataset +from clin_nlp_metrics.dataset import Dataset import json # medcattrainer @@ -23,7 +23,7 @@ import json with open('medcattrainer_export.json', 'rb') as f: mtrainer_data = json.load(f) - + d1 = Dataset.from_medcattrainer(mctrainer_data) # clinlp @@ -31,6 +31,7 @@ import clinlp import spacy from model import get_model # not included + nlp = get_model() nlp_docs = nlp.pipe([doc['text'] for doc in data['projects'][0]['documents']]) diff --git a/pyproject.toml b/pyproject.toml index 2e6b746..15aec1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "clin-nlp-metrics" -version = "0.1.1" +version = "0.1.2" authors = [ { name="Richard Bartels", email="r.t.bartels-6@umcutrecht.nl" }, ] @@ -15,7 +15,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] -dependencies = ["spacy~=3.0", "clinlp~=0.5"] +dependencies = ["spacy~=3.0", "clinlp>=0.6"] [project.optional-dependencies] test = ["pytest", "pytest-cov"] diff --git a/src/clin_nlp_metrics/__init__.py b/src/clin_nlp_metrics/__init__.py index e69de29..34aa12a 100644 --- a/src/clin_nlp_metrics/__init__.py +++ b/src/clin_nlp_metrics/__init__.py @@ -0,0 +1,3 @@ +from .dataset import Dataset + +__all__ = ["Dataset"] diff --git a/src/clin_nlp_metrics/metrics.py b/src/clin_nlp_metrics/dataset.py similarity index 82% rename from src/clin_nlp_metrics/metrics.py rename to src/clin_nlp_metrics/dataset.py index 048724a..4a83209 100644 --- a/src/clin_nlp_metrics/metrics.py +++ b/src/clin_nlp_metrics/dataset.py @@ -23,7 +23,7 @@ class Annotation: label: str """ The label/tag""" - qualifiers: Optional[list[str]] = None + qualifiers: Optional[list[dict]] = None """ Optionally, a list of qualifiers""" def to_nervaluate(self) -> dict: @@ -33,7 +33,6 @@ def to_nervaluate(self) -> dict: Returns ------- A dictionary with the items nervaluate expects. - """ return {"start": self.start, "end": self.end, "label": self.label} @@ -96,12 +95,24 @@ def from_clinlp_docs( annotations = [] for ent in doc.ents: + qualifiers = [] + + for qualifier in ent._.qualifiers_dict: + qualifiers.append( + { + "name": qualifier["name"].title(), + "value": qualifier["value"].title(), + "is_default": qualifier["is_default"], + } + ) + annotations.append( Annotation( text=str(ent), start=ent.start_char, end=ent.end_char, label=ent.label_, + qualifiers=qualifiers, ) ) @@ -142,12 +153,23 @@ def from_medcattrainer(data: dict) -> "Dataset": for annotation in doc["annotations"]: if not annotation["deleted"]: + qualifiers = [] + + for qualifier in annotation["meta_anns"].values(): + qualifiers.append( + { + "name": qualifier["name"].title(), + "value": qualifier["value"].title(), + } + ) + annotations.append( Annotation( text=annotation["value"], start=annotation["start"], end=annotation["end"], label=annotation["cui"], + qualifiers=qualifiers, ) ) diff --git a/tests/data/clinlp_docs.pickle b/tests/data/clinlp_docs.pickle index a166a6b..86d2fc1 100644 Binary files a/tests/data/clinlp_docs.pickle and b/tests/data/clinlp_docs.pickle differ diff --git a/tests/data/medcattrainer_export.json b/tests/data/medcattrainer_export.json index ab0c6ce..0d706c1 100644 --- a/tests/data/medcattrainer_export.json +++ b/tests/data/medcattrainer_export.json @@ -20,7 +20,7 @@ "end": 1068, "validated": true, "correct": true, - "deleted": false, + "deleted": true, "alternative": false, "killed": false, "irrelevant": false, @@ -87,7 +87,7 @@ }, "Negation": { "name": "Negation", - "value": "Affirmed", + "value": "NEGATED", "acc": 1.0, "validated": true }, diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 5347893..4007dac 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -3,7 +3,7 @@ import clinlp # noqa: F401 -from clin_nlp_metrics.metrics import Annotation, Dataset, Document +from clin_nlp_metrics.dataset import Annotation, Dataset, Document class TestAnnotation: @@ -31,19 +31,67 @@ def test_document_nervaluate(self): class TestDataset: - def test_from_medcattrainer(self): + def test_dataset_from_medcattrainer(self): with open("tests/data/medcattrainer_export.json", "rb") as f: mctrainer_data = json.load(f) - # TODO needs more specific tests, when more functionality is there - assert Dataset.from_medcattrainer(data=mctrainer_data) + dataset = Dataset.from_medcattrainer(data=mctrainer_data) - def test_from_clinlp(self): + assert len(dataset.docs) == 2 + assert dataset.docs[0].text == "random text sample" + assert len(dataset.docs[0].annotations) == 1 + assert len(dataset.docs[1].annotations) == 3 + + assert dataset.docs[0].annotations[0].text == "anemie" + assert dataset.docs[0].annotations[0].start == 978 + assert dataset.docs[0].annotations[0].end == 984 + assert dataset.docs[0].annotations[0].label == "C0002871_anemie" + + assert dataset.docs[1].annotations[0].text == "<< p3" + assert dataset.docs[1].annotations[0].start == 1739 + assert dataset.docs[1].annotations[0].end == 1744 + assert ( + dataset.docs[1].annotations[0].label + == "C0015934_intrauterine_groeivertraging" + ) + + assert dataset.docs[0].annotations[0].qualifiers == [ + {"name": "Plausibility", "value": "Plausible"}, + {"name": "Temporality", "value": "Current"}, + {"name": "Negation", "value": "Negated"}, + {"name": "Experiencer", "value": "Patient"}, + ] + + def test_dataset_from_clinlp(self): with open("tests/data/clinlp_docs.pickle", "rb") as f: clinlp_docs = pickle.load(f) - # TODO needs more specific tests, when more functionality is there - assert Dataset.from_clinlp_docs(nlp_docs=clinlp_docs) + dataset = Dataset.from_clinlp_docs(nlp_docs=clinlp_docs) + + assert len(dataset.docs) == 3 + assert dataset.docs[0].text == "patient had geen anemie" + assert len(dataset.docs[0].annotations) == 1 + assert len(dataset.docs[1].annotations) == 2 + assert len(dataset.docs[2].annotations) == 1 + + assert dataset.docs[0].annotations[0].text == "anemie" + assert dataset.docs[0].annotations[0].start == 17 + assert dataset.docs[0].annotations[0].end == 23 + assert dataset.docs[0].annotations[0].label == "C0002871_anemie" + + assert dataset.docs[1].annotations[0].text == "prematuriteit" + assert dataset.docs[1].annotations[0].start == 18 + assert dataset.docs[1].annotations[0].end == 31 + assert dataset.docs[1].annotations[0].label == "C0151526_prematuriteit" + + assert sorted( + dataset.docs[0].annotations[0].qualifiers, key=lambda q: q["name"] + ) == [ + {"name": "Experiencer", "value": "Patient", "is_default": True}, + {"name": "Negation", "value": "Negated", "is_default": False}, + {"name": "Plausibility", "value": "Plausible", "is_default": True}, + {"name": "Temporality", "value": "Current", "is_default": True}, + ] def test_dataset_nervaluate(self): dataset = Dataset( @@ -52,7 +100,13 @@ def test_dataset_nervaluate(self): identifier="1", text="test1", annotations=[ - Annotation(text="test1", start=0, end=5, label="test1"), + Annotation( + text="test1", + start=0, + end=5, + label="test1", + qualifiers=[{"name": "Negation", "value": "Negated"}], + ), ], ), Document(