This repository has been archived by the owner on May 13, 2024. It is now read-only.

Commit

Merge pull request #2 from umcu/add-qualifier-info
Parse qualifier info
vmenger authored Oct 3, 2023
2 parents d3daced + ec7d412 commit 220cd7d
Showing 8 changed files with 103 additions and 17 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -5,12 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 0.1.2 (2023-10-03)

### Added

- Parsing qualifier/meta annotation info

## 0.1.1 (2023-09-27)

### Added
- Classes for `Annotation`, `Document` and `Dataset`
- Parsing of `clinlp` output
- Parsing of `mecattrainer` output
- Parsing of `medcattrainer` output

## 0.1.0 (2023-08-14)

5 changes: 3 additions & 2 deletions README.md
@@ -15,22 +15,23 @@ pip install -e .
A small example to get started:

```python
from clin_nlp_metrics.metrics import Dataset
from clin_nlp_metrics.dataset import Dataset
import json

# medcattrainer
import json

with open('medcattrainer_export.json', 'rb') as f:
    mctrainer_data = json.load(f)

d1 = Dataset.from_medcattrainer(mctrainer_data)

# clinlp
import clinlp
import spacy

from model import get_model # not included

nlp = get_model()
nlp_docs = nlp.pipe([doc['text'] for doc in mctrainer_data['projects'][0]['documents']])

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "clin-nlp-metrics"
version = "0.1.1"
version = "0.1.2"
authors = [
{ name="Richard Bartels", email="r.t.bartels-6@umcutrecht.nl" },
]
@@ -15,7 +15,7 @@ classifiers = [
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
]
dependencies = ["spacy~=3.0", "clinlp~=0.5"]
dependencies = ["spacy~=3.0", "clinlp>=0.6"]

[project.optional-dependencies]
test = ["pytest", "pytest-cov"]
3 changes: 3 additions & 0 deletions src/clin_nlp_metrics/__init__.py
@@ -0,0 +1,3 @@
from .dataset import Dataset

__all__ = ["Dataset"]
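With the new `__init__.py` re-exporting `Dataset`, the class can be imported straight from the package root; a minimal sketch (assuming the package is installed):

```python
# Minimal sketch: the re-export above makes Dataset available at package level,
# so the full module path is no longer required.
from clin_nlp_metrics import Dataset  # same object as clin_nlp_metrics.dataset.Dataset
```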
src/clin_nlp_metrics/dataset.py
@@ -23,7 +23,7 @@ class Annotation:
label: str
""" The label/tag"""

qualifiers: Optional[list[str]] = None
qualifiers: Optional[list[dict]] = None
""" Optionally, a list of qualifiers"""

def to_nervaluate(self) -> dict:
@@ -33,7 +33,6 @@ def to_nervaluate(self) -> dict:
Returns
-------
A dictionary with the items nervaluate expects.
"""
return {"start": self.start, "end": self.end, "label": self.label}
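For reference, a small sketch of what this conversion yields for a hypothetical annotation (span values borrowed from the test fixtures in this PR):

```python
# Hypothetical annotation: to_nervaluate() keeps only the span fields that
# nervaluate expects and drops the text and qualifiers.
ann = Annotation(text="anemie", start=17, end=23, label="C0002871_anemie")
assert ann.to_nervaluate() == {"start": 17, "end": 23, "label": "C0002871_anemie"}
```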

@@ -96,12 +95,24 @@ def from_clinlp_docs(
annotations = []

for ent in doc.ents:
qualifiers = []

for qualifier in ent._.qualifiers_dict:
qualifiers.append(
{
"name": qualifier["name"].title(),
"value": qualifier["value"].title(),
"is_default": qualifier["is_default"],
}
)

annotations.append(
Annotation(
text=str(ent),
start=ent.start_char,
end=ent.end_char,
label=ent.label_,
qualifiers=qualifiers,
)
)
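After this loop, each qualifier attached to a clinlp entity ends up as a plain dict on the `Annotation`; a sketch of the resulting shape, with values mirroring the test fixture further down (the exact contents of the clinlp extension are an assumption):

```python
# Sketch of Annotation.qualifiers as built from a clinlp entity
# (values mirror tests/test_metrics.py; the clinlp extension data itself is assumed).
qualifiers = [
    {"name": "Negation", "value": "Negated", "is_default": False},
    {"name": "Experiencer", "value": "Patient", "is_default": True},
]
```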

@@ -142,12 +153,23 @@ def from_medcattrainer(data: dict) -> "Dataset":

for annotation in doc["annotations"]:
if not annotation["deleted"]:
qualifiers = []

for qualifier in annotation["meta_anns"].values():
qualifiers.append(
{
"name": qualifier["name"].title(),
"value": qualifier["value"].title(),
}
)

annotations.append(
Annotation(
text=annotation["value"],
start=annotation["start"],
end=annotation["end"],
label=annotation["cui"],
qualifiers=qualifiers,
)
)
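The `meta_anns` block this loop reads is keyed by qualifier name; a minimal sketch of the input and the parsed result, based on the test export changed in this PR:

```python
# Minimal sketch of one medcattrainer annotation's meta_anns block
# (field names taken from tests/data/medcattrainer_export.json).
meta_anns = {
    "Negation": {"name": "Negation", "value": "NEGATED", "acc": 1.0, "validated": True},
}
qualifiers = [
    {"name": q["name"].title(), "value": q["value"].title()}
    for q in meta_anns.values()
]
# qualifiers == [{"name": "Negation", "value": "Negated"}]
```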

Binary file modified tests/data/clinlp_docs.pickle
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/data/medcattrainer_export.json
@@ -20,7 +20,7 @@
"end": 1068,
"validated": true,
"correct": true,
"deleted": false,
"deleted": true,
"alternative": false,
"killed": false,
"irrelevant": false,
@@ -87,7 +87,7 @@
},
"Negation": {
"name": "Negation",
"value": "Affirmed",
"value": "NEGATED",
"acc": 1.0,
"validated": true
},
70 changes: 62 additions & 8 deletions tests/test_metrics.py
@@ -3,7 +3,7 @@

import clinlp # noqa: F401

from clin_nlp_metrics.metrics import Annotation, Dataset, Document
from clin_nlp_metrics.dataset import Annotation, Dataset, Document


class TestAnnotation:
@@ -31,19 +31,67 @@ def test_document_nervaluate(self):


class TestDataset:
def test_from_medcattrainer(self):
def test_dataset_from_medcattrainer(self):
with open("tests/data/medcattrainer_export.json", "rb") as f:
mctrainer_data = json.load(f)

# TODO needs more specific tests, when more functionality is there
assert Dataset.from_medcattrainer(data=mctrainer_data)
dataset = Dataset.from_medcattrainer(data=mctrainer_data)

def test_from_clinlp(self):
assert len(dataset.docs) == 2
assert dataset.docs[0].text == "random text sample"
assert len(dataset.docs[0].annotations) == 1
assert len(dataset.docs[1].annotations) == 3

assert dataset.docs[0].annotations[0].text == "anemie"
assert dataset.docs[0].annotations[0].start == 978
assert dataset.docs[0].annotations[0].end == 984
assert dataset.docs[0].annotations[0].label == "C0002871_anemie"

assert dataset.docs[1].annotations[0].text == "<< p3"
assert dataset.docs[1].annotations[0].start == 1739
assert dataset.docs[1].annotations[0].end == 1744
assert (
dataset.docs[1].annotations[0].label
== "C0015934_intrauterine_groeivertraging"
)

assert dataset.docs[0].annotations[0].qualifiers == [
{"name": "Plausibility", "value": "Plausible"},
{"name": "Temporality", "value": "Current"},
{"name": "Negation", "value": "Negated"},
{"name": "Experiencer", "value": "Patient"},
]

def test_dataset_from_clinlp(self):
with open("tests/data/clinlp_docs.pickle", "rb") as f:
clinlp_docs = pickle.load(f)

# TODO needs more specific tests, when more functionality is there
assert Dataset.from_clinlp_docs(nlp_docs=clinlp_docs)
dataset = Dataset.from_clinlp_docs(nlp_docs=clinlp_docs)

assert len(dataset.docs) == 3
assert dataset.docs[0].text == "patient had geen anemie"
assert len(dataset.docs[0].annotations) == 1
assert len(dataset.docs[1].annotations) == 2
assert len(dataset.docs[2].annotations) == 1

assert dataset.docs[0].annotations[0].text == "anemie"
assert dataset.docs[0].annotations[0].start == 17
assert dataset.docs[0].annotations[0].end == 23
assert dataset.docs[0].annotations[0].label == "C0002871_anemie"

assert dataset.docs[1].annotations[0].text == "prematuriteit"
assert dataset.docs[1].annotations[0].start == 18
assert dataset.docs[1].annotations[0].end == 31
assert dataset.docs[1].annotations[0].label == "C0151526_prematuriteit"

assert sorted(
dataset.docs[0].annotations[0].qualifiers, key=lambda q: q["name"]
) == [
{"name": "Experiencer", "value": "Patient", "is_default": True},
{"name": "Negation", "value": "Negated", "is_default": False},
{"name": "Plausibility", "value": "Plausible", "is_default": True},
{"name": "Temporality", "value": "Current", "is_default": True},
]

def test_dataset_nervaluate(self):
dataset = Dataset(
@@ -52,7 +100,13 @@ def test_dataset_nervaluate(self):
identifier="1",
text="test1",
annotations=[
Annotation(text="test1", start=0, end=5, label="test1"),
Annotation(
text="test1",
start=0,
end=5,
label="test1",
qualifiers=[{"name": "Negation", "value": "Negated"}],
),
],
),
Document(
