This repository has been archived by the owner on May 13, 2024. It is now read-only.

Commit

Merge pull request #2 from umcu/add-qualifier-info
Parse qualifier info
vmenger authored Oct 3, 2023
2 parents d3daced + ec7d412 commit 220cd7d
Showing 8 changed files with 103 additions and 17 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -5,12 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.1.2 (2023-10-03)

### Added

- Parsing qualifier/meta annotation info

## 0.1.1 (2023-09-27)

### Added
- Classes for `Annotation`, `Document` and `Dataset`
- Parsing of `clinlp` output
- Parsing of `mecattrainer` output
- Parsing of `medcattrainer` output

## 0.1.0 (2023-08-14)

5 changes: 3 additions & 2 deletions README.md
@@ -15,22 +15,23 @@ pip install -e .
A small example to get started:

```python
from clin_nlp_metrics.metrics import Dataset
from clin_nlp_metrics.dataset import Dataset
import json

# medcattrainer
import json

with open('medcattrainer_export.json', 'rb') as f:
    mctrainer_data = json.load(f)

d1 = Dataset.from_medcattrainer(mctrainer_data)

# clinlp
import clinlp
import spacy

from model import get_model # not included

nlp = get_model()
nlp_docs = nlp.pipe([doc['text'] for doc in mctrainer_data['projects'][0]['documents']])

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "clin-nlp-metrics"
version = "0.1.1"
version = "0.1.2"
authors = [
{ name="Richard Bartels", email="r.t.bartels-6@umcutrecht.nl" },
]
@@ -15,7 +15,7 @@ classifiers = [
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
]
dependencies = ["spacy~=3.0", "clinlp~=0.5"]
dependencies = ["spacy~=3.0", "clinlp>=0.6"]

[project.optional-dependencies]
test = ["pytest", "pytest-cov"]
3 changes: 3 additions & 0 deletions src/clin_nlp_metrics/__init__.py
@@ -0,0 +1,3 @@
from .dataset import Dataset

__all__ = ["Dataset"]
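With the new `__init__.py` re-exporting `Dataset`, the class can be imported straight from the package root; a minimal sketch (assuming the package is installed):

```python
# Minimal sketch: the re-export above makes Dataset available at package level,
# so the full module path is no longer required.
from clin_nlp_metrics import Dataset  # same object as clin_nlp_metrics.dataset.Dataset
```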
src/clin_nlp_metrics/dataset.py
@@ -23,7 +23,7 @@ class Annotation:
label: str
""" The label/tag"""

qualifiers: Optional[list[str]] = None
qualifiers: Optional[list[dict]] = None
""" Optionally, a list of qualifiers"""

def to_nervaluate(self) -> dict:
@@ -33,7 +33,6 @@ def to_nervaluate(self) -> dict:
Returns
-------
A dictionary with the items nervaluate expects.
"""
return {"start": self.start, "end": self.end, "label": self.label}
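For reference, a small sketch of what this conversion yields for a hypothetical annotation (span values borrowed from the test fixtures in this PR):

```python
# Hypothetical annotation: to_nervaluate() keeps only the span fields that
# nervaluate expects and drops the text and qualifiers.
ann = Annotation(text="anemie", start=17, end=23, label="C0002871_anemie")
assert ann.to_nervaluate() == {"start": 17, "end": 23, "label": "C0002871_anemie"}
```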

@@ -96,12 +95,24 @@ def from_clinlp_docs(
annotations = []

for ent in doc.ents:
qualifiers = []

for qualifier in ent._.qualifiers_dict:
qualifiers.append(
{
"name": qualifier["name"].title(),
"value": qualifier["value"].title(),
"is_default": qualifier["is_default"],
}
)

annotations.append(
Annotation(
text=str(ent),
start=ent.start_char,
end=ent.end_char,
label=ent.label_,
qualifiers=qualifiers,
)
)
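After this loop, each qualifier attached to a clinlp entity ends up as a plain dict on the `Annotation`; a sketch of the resulting shape, with values mirroring the test fixture further down (the exact contents of the clinlp extension are an assumption):

```python
# Sketch of Annotation.qualifiers as built from a clinlp entity
# (values mirror tests/test_metrics.py; the clinlp extension data itself is assumed).
qualifiers = [
    {"name": "Negation", "value": "Negated", "is_default": False},
    {"name": "Experiencer", "value": "Patient", "is_default": True},
]
```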

@@ -142,12 +153,23 @@ def from_medcattrainer(data: dict) -> "Dataset":

for annotation in doc["annotations"]:
if not annotation["deleted"]:
qualifiers = []

for qualifier in annotation["meta_anns"].values():
qualifiers.append(
{
"name": qualifier["name"].title(),
"value": qualifier["value"].title(),
}
)

annotations.append(
Annotation(
text=annotation["value"],
start=annotation["start"],
end=annotation["end"],
label=annotation["cui"],
qualifiers=qualifiers,
)
)
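The `meta_anns` block this loop reads is keyed by qualifier name; a minimal sketch of the input and the parsed result, based on the test export changed in this PR:

```python
# Minimal sketch of one medcattrainer annotation's meta_anns block
# (field names taken from tests/data/medcattrainer_export.json).
meta_anns = {
    "Negation": {"name": "Negation", "value": "NEGATED", "acc": 1.0, "validated": True},
}
qualifiers = [
    {"name": q["name"].title(), "value": q["value"].title()}
    for q in meta_anns.values()
]
# qualifiers == [{"name": "Negation", "value": "Negated"}]
```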

Binary file modified tests/data/clinlp_docs.pickle
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/data/medcattrainer_export.json
@@ -20,7 +20,7 @@
"end": 1068,
"validated": true,
"correct": true,
"deleted": false,
"deleted": true,
"alternative": false,
"killed": false,
"irrelevant": false,
@@ -87,7 +87,7 @@
},
"Negation": {
"name": "Negation",
"value": "Affirmed",
"value": "NEGATED",
"acc": 1.0,
"validated": true
},
70 changes: 62 additions & 8 deletions tests/test_metrics.py
@@ -3,7 +3,7 @@

import clinlp # noqa: F401

from clin_nlp_metrics.metrics import Annotation, Dataset, Document
from clin_nlp_metrics.dataset import Annotation, Dataset, Document


class TestAnnotation:
@@ -31,19 +31,67 @@ def test_document_nervaluate(self):


class TestDataset:
def test_from_medcattrainer(self):
def test_dataset_from_medcattrainer(self):
with open("tests/data/medcattrainer_export.json", "rb") as f:
mctrainer_data = json.load(f)

# TODO needs more specific tests, when more functionality is there
assert Dataset.from_medcattrainer(data=mctrainer_data)
dataset = Dataset.from_medcattrainer(data=mctrainer_data)

def test_from_clinlp(self):
assert len(dataset.docs) == 2
assert dataset.docs[0].text == "random text sample"
assert len(dataset.docs[0].annotations) == 1
assert len(dataset.docs[1].annotations) == 3

assert dataset.docs[0].annotations[0].text == "anemie"
assert dataset.docs[0].annotations[0].start == 978
assert dataset.docs[0].annotations[0].end == 984
assert dataset.docs[0].annotations[0].label == "C0002871_anemie"

assert dataset.docs[1].annotations[0].text == "<< p3"
assert dataset.docs[1].annotations[0].start == 1739
assert dataset.docs[1].annotations[0].end == 1744
assert (
dataset.docs[1].annotations[0].label
== "C0015934_intrauterine_groeivertraging"
)

assert dataset.docs[0].annotations[0].qualifiers == [
{"name": "Plausibility", "value": "Plausible"},
{"name": "Temporality", "value": "Current"},
{"name": "Negation", "value": "Negated"},
{"name": "Experiencer", "value": "Patient"},
]

def test_dataset_from_clinlp(self):
with open("tests/data/clinlp_docs.pickle", "rb") as f:
clinlp_docs = pickle.load(f)

# TODO needs more specific tests, when more functionality is there
assert Dataset.from_clinlp_docs(nlp_docs=clinlp_docs)
dataset = Dataset.from_clinlp_docs(nlp_docs=clinlp_docs)

assert len(dataset.docs) == 3
assert dataset.docs[0].text == "patient had geen anemie"
assert len(dataset.docs[0].annotations) == 1
assert len(dataset.docs[1].annotations) == 2
assert len(dataset.docs[2].annotations) == 1

assert dataset.docs[0].annotations[0].text == "anemie"
assert dataset.docs[0].annotations[0].start == 17
assert dataset.docs[0].annotations[0].end == 23
assert dataset.docs[0].annotations[0].label == "C0002871_anemie"

assert dataset.docs[1].annotations[0].text == "prematuriteit"
assert dataset.docs[1].annotations[0].start == 18
assert dataset.docs[1].annotations[0].end == 31
assert dataset.docs[1].annotations[0].label == "C0151526_prematuriteit"

assert sorted(
dataset.docs[0].annotations[0].qualifiers, key=lambda q: q["name"]
) == [
{"name": "Experiencer", "value": "Patient", "is_default": True},
{"name": "Negation", "value": "Negated", "is_default": False},
{"name": "Plausibility", "value": "Plausible", "is_default": True},
{"name": "Temporality", "value": "Current", "is_default": True},
]

def test_dataset_nervaluate(self):
dataset = Dataset(
@@ -52,7 +100,13 @@ def test_dataset_nervaluate(self):
identifier="1",
text="test1",
annotations=[
Annotation(text="test1", start=0, end=5, label="test1"),
Annotation(
text="test1",
start=0,
end=5,
label="test1",
qualifiers=[{"name": "Negation", "value": "Negated"}],
),
],
),
Document(
