From b572b5b1e200eab4714d85592f2f519ede284bf7 Mon Sep 17 00:00:00 2001 From: fcossio Date: Wed, 17 Aug 2022 15:24:22 +0200 Subject: [PATCH 1/5] allow datasets to be parsed even if they dont come from a file directly --- deid/dicom/fields.py | 7 ++++--- deid/dicom/parser.py | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/deid/dicom/fields.py b/deid/dicom/fields.py index e2d7af83..58737280 100644 --- a/deid/dicom/fields.py +++ b/deid/dicom/fields.py @@ -237,7 +237,7 @@ def expand_field_expression(field, dicom, contenders=None): return fields -def get_fields(dicom, skip=None, expand_sequences=True, seen=None): +def get_fields(dicom, skip=None, expand_sequences=True, seen=None, from_file=True): """Expand all dicom fields into a list. Each entry is a DicomField. If we find a sequence, we unwrap it and @@ -250,8 +250,9 @@ def get_fields(dicom, skip=None, expand_sequences=True, seen=None): if not isinstance(skip, list): skip = [skip] - # Retrieve both dicom and file meta fields - datasets = [dicom, dicom.file_meta] + # Retrieve both dicom and file meta fields if dicom came from a file + datasets = [dicom, dicom.file_meta] if from_file else [dicom] + def add_element(element, name, uid, is_filemeta): """ diff --git a/deid/dicom/parser.py b/deid/dicom/parser.py index f041ad5d..38db2789 100644 --- a/deid/dicom/parser.py +++ b/deid/dicom/parser.py @@ -57,8 +57,18 @@ class DicomParser: """ def __init__( - self, dicom_file, recipe=None, config=None, force=True, disable_skip=False + self, dicom_file, recipe=None, config=None, force=True, disable_skip=False, from_file=True ): + """Create new instance of DicomParser + + :param dicom_file: Path to a dicom file or instance of a pydicom.Dataset + :param recipe: a deid recipe, defaults to None + :param config: deid config, defaults to None + :param force: ignore errors when reading a dicom file, defaults to True + :param disable_skip: _description_, defaults to False + :param from_file: the 
dicom_file comes from an actual file, defaults to True. + If set to False, some operations that extract file details are skipped. + """ # Lookup for the dicom self.lookup = {} @@ -81,6 +91,8 @@ def __init__( # Deid can be a recipe or filename if not isinstance(recipe, DeidRecipe): recipe = DeidRecipe(recipe) + + self.from_file = from_file self.load(dicom_file, force=force) self.recipe = recipe @@ -111,8 +123,9 @@ def load(self, dicom_file, force=True): self.dicom = read_file(dicom_file, force=force) # Set class variables that might be helpful later - self.dicom_file = os.path.abspath(self.dicom.filename) - self.dicom_name = os.path.basename(self.dicom_file) + if self.from_file: + self.dicom_file = os.path.abspath(self.dicom.filename) + self.dicom_name = os.path.basename(self.dicom_file) def define(self, name, value): """ @@ -302,6 +315,7 @@ def get_fields(self, expand_sequences=True): expand_sequences=expand_sequences, seen=self.seen, skip=self.skip, + from_file=self.from_file ) return self.fields From 6451e1611beb9c0545dff1b2947532857e1adbb6 Mon Sep 17 00:00:00 2001 From: fcossio Date: Wed, 17 Aug 2022 17:14:04 +0200 Subject: [PATCH 2/5] example for custom parsing of dataset instances --- docs/_docs/examples/deid-dataset.md | 254 ++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 docs/_docs/examples/deid-dataset.md diff --git a/docs/_docs/examples/deid-dataset.md b/docs/_docs/examples/deid-dataset.md new file mode 100644 index 00000000..a8ee20bb --- /dev/null +++ b/docs/_docs/examples/deid-dataset.md @@ -0,0 +1,254 @@ +--- +title: Deidentify a Pydicom Dataset +category: Examples +order: 3 +--- + +In this example we will create a custom class to deidentify a single instance of a `pydicom.Dataset` with a custom recipe. 
+ + +### Overview +We will use four files for this example: +``` +my_deid_example +├── my_deid_recipe.dicom +├── my_dicom_file.json +├── my_module.py +└── requirements.txt +``` + +The `requirements.txt` file is used only to be able to run this example. +``` +deid +pydicom +pycryptodome +``` + +We can install them by running the following commands (requires `conda`) +```bash +conda create -n deid_example python=3.9 +conda activate deid_example +cd my_deid_example +pip install -r requirements.txt +``` + + +The contents of `my_dicom_file.json` are used to load a pydicom.Dataset instance. +```json +{ + "SpecificCharacterSet":{"vr":"CS","Value":["ISO_IR 100"]}, + "ImageType":{"vr":"CS","Value":["DERIVED","PRIMARY"]}, + "SOPClassUID":{"vr":"UI","Value":["1.2.840.10008.5.1.4.1.1.1.2"]}, + "StudyDate":{"vr":"DA","Value":["20220627"]}, + "SeriesDate":{"vr":"DA","Value":["20220627"]}, + "AcquisitionDate":{"vr":"DA","Value":["20220627"]}, + "ContentDate":{"vr":"DA","Value":["20220627"]}, + "StudyTime":{"vr":"TM","Value":["080803"]}, + "ContentTime":{"vr":"TM","Value":["080808.202000"]}, + "PatientName":{"vr":"PN","Value":[{"Alphabetic":"Maria^Doe"}]}, + "PatientID":{"vr":"LO","Value":["1234567890"]}, + "PatientBirthDate":{"vr":"DA","Value":["19900606"]}, + "Modality":{"vr":"CS","Value":["MG"]}, + "PatientSex":{"vr":"CS","Value":["F"]}, + "PatientAge":{"vr":"AS","Value":["032Y"]}, + "StudyID":{"vr":"SH","Value":["mammogram87654"]} +} +``` + + +### The recipe + +We create a custom recipe `my_deid_recipe.dicom` that specifies what we want to do. +``` +FORMAT dicom + +%header + +ADD PatientIdentityRemoved Yes +ADD DeidentificationMethod my_deid_recipe.dicom.v1.0 + +# Specify what we want to keep + +KEEP ContentDate +KEEP StudyDate + +# Replacements with custom functions. 
Those are registered in my_module.py + +REPLACE PatientName func:replace_name +REPLACE AccessionNumber func:hash_func +REPLACE AdmissionID func:hash_func +REPLACE InterpretationID func:hash_func +REPLACE PatientBirthDate func:remove_day +REPLACE PatientID func:hash_func +REPLACE PerformedProcedureStepID func:hash_func +REPLACE PerformingPhysicianName func:hash_func +REPLACE RequestedProcedureID func:hash_func +REPLACE ResultsID func:hash_func +REPLACE StudyID func:hash_func + + +# Tags that require custom regex expressions +# Curve Data"(50xx,xxxx)" +REMOVE contains:^50.{6}$ +# Overlay comments and data (60xx[34]000) +REMOVE contains:^60.{2}[34]000$ +# Private tags ggggeeee where gggg is odd +REMOVE contains:^.{3}[13579].{4}$ + +# Blank the other tags + +BLANK PatientWeight +BLANK PatientSize +REMOVE PatientAge +REMOVE SeriesDate +REMOVE AcquisitionDate +REMOVE StudyTime +REMOVE ContentTime +REMOVE PatientAge +REMOVE PatientSex + +# ... etc +``` + + +### The custom deidentifier class + +```python +from deid.config import DeidRecipe +from deid.dicom.parser import DicomParser +import pydicom +from Crypto.Hash import SHA512 +from datetime import datetime + +class DeidDataset: + """This class allows to pseudonymize an instance of + pydicom.Dataset with our custom recipe and functions. + """ + def __init__(self, secret_salt: str, recipe_path: str): + """New instance of our pseudonymizer class. + + :param secret_salt: a random string that makes the + hashing harder to break. + :param recipe_path: path to our deid recipe. 
+ """ + self.secret_salt = secret_salt + self.recipe = DeidRecipe(recipe_path) + + def pseudonymize(self, dataset:pydicom.Dataset) -> pydicom.Dataset: + """Pseudonymize a single dicom dataset + + :param dataset: dataset that will be pseudonymized + :returns: pseudonymized dataset + """ + parser = DicomParser(dataset, self.recipe, from_file=False) + # register functions that are specified in the recipe + parser.define('replace_name', self.replace_name) + parser.define('hash_func', self.deid_hash_func) + parser.define('remove_day', self.remove_day) + # parse the dataset and apply the deidentification + parser.parse(strip_sequences=True, remove_private=True) + return parser.dicom + + # All registered functions that are used in the recipe must + # receive the arguments: `item`, `value`, `field`, `dicom` + + def deid_hash_func(self, item, value, field, dicom) -> str: + """Performs self.hash to field.element.value""" + val = field.element.value + return self.hash(str(val)) + + @staticmethod + def remove_day(item, value, field, dicom) -> str: + """Removes the day from a DA (date) field in the deid framework""" + dt = datetime.strptime(field.element.value, '%Y%m%d') + return dt.strftime("%Y%m01") + + @staticmethod + def replace_name(item, value, field, dicom) -> str: + """Replace PatientName with PatientSex and coarse PatientAge""" + sex = dicom.get('PatientSex') + sex = {"F":'Female', "M": 'Male', 'O':'Other'}[sex] + age = DeidDataset.round_to_nearest(int(dicom.get('PatientAge')[:-1]), 5) + return f"{sex} {age:03d}Y {dicom.get('Modality')}" + + # Helper methods for our registered ones + @staticmethod + def round_to_nearest(value, interval): + """Rounds value to closest multiple of interval""" + return interval * round(value/interval) + + def hash(self, msg: str) -> str: + """ + :param msg: message that we want to hash, + normally the PatientID or the StudyID. 
+ :return: the encrypted message as hexdigest + (in characters from '0' to '9' and 'a' to 'f') + """ + assert type(msg) == str, f"value is not of type str, {type(msg)}" + h = SHA512.new(truncate="256") + bytes_str = bytes(f"{self.secret_salt}{msg}", "utf-8") + h.update(bytes_str) + return str(h.hexdigest()) + +# Load the pydicom Dataset +import json + +# Unorthodox way of loading a pydicom.Dataset +# please see pydicom documentation for more information +# on how to load dicom files +with open('my_dicom_file.json') as f: + dataset_dict = json.load(f) +dataset = pydicom.Dataset.from_json(dataset_dict) + +print('Dataset before pseudonymization') +print(dataset) + +#create an instance of our class +deid_ds = DeidDataset("!2#4%6&7abc", 'my_deid_recipe.dicom') + +#pseudonymize the dataset +print('\nDataset after pseudonymization') +pseudonymized = deid_ds.pseudonymize(dataset) +print(pseudonymized) +``` + +If we execute our python module + +```bash +python my_module.py +``` + +It will give us the following output: +``` +Dataset before pseudonymization +(0008, 0005) Specific Character Set CS: 'ISO_IR 100' +(0008, 0008) Image Type CS: ['DERIVED', 'PRIMARY'] +(0008, 0016) SOP Class UID UI: Digital Mammography X-Ray Image Storage - For Presentation +(0008, 0020) Study Date DA: '20220627' +(0008, 0021) Series Date DA: '20220627' +(0008, 0022) Acquisition Date DA: '20220627' +(0008, 0023) Content Date DA: '20220627' +(0008, 0030) Study Time TM: '080803' +(0008, 0033) Content Time TM: '080808.202000' +(0008, 0060) Modality CS: 'MG' +(0010, 0010) Patient's Name PN: 'Maria^Doe' +(0010, 0020) Patient ID LO: '1234567890' +(0010, 0030) Patient's Birth Date DA: '19900606' +(0010, 0040) Patient's Sex CS: 'F' +(0010, 1010) Patient's Age AS: '032Y' +(0020, 0010) Study ID SH: 'mammogram87654' + +Dataset after pseudonymization +(0008, 0005) Specific Character Set CS: 'ISO_IR 100' +(0008, 0008) Image Type CS: ['DERIVED', 'PRIMARY'] +(0008, 0016) SOP Class UID UI: Digital Mammography X-Ray 
Image Storage - For Presentation +(0008, 0020) Study Date DA: '20220627' +(0008, 0023) Content Date DA: '20220627' +(0008, 0060) Modality CS: 'MG' +(0010, 0010) Patient's Name PN: 'Female 030Y MG' +(0010, 0020) Patient ID LO: 'df65775690879c36437ae950c52d025102a1f9b8c8132f8b017f14e9ec45eacb' +(0010, 0030) Patient's Birth Date DA: '19900601' +(0012, 0062) Patient Identity Removed CS: 'Yes' +(0012, 0063) De-identification Method LO: 'my_deid_recipe.dicom.v1.0' +(0020, 0010) Study ID SH: 'ae4b477e5709d0c1f746e0adc9ab552fee100b91416f9f3a04037e999077e823' +``` \ No newline at end of file From 53da9c673f0d24de386de3124d1ea787ed50eb6d Mon Sep 17 00:00:00 2001 From: fcossio Date: Fri, 26 Aug 2022 17:03:58 +0200 Subject: [PATCH 3/5] automatic detection of "from_file" --- deid/dicom/fields.py | 4 ++-- deid/dicom/parser.py | 12 ++++-------- docs/_docs/examples/deid-dataset.md | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/deid/dicom/fields.py b/deid/dicom/fields.py index 58737280..1351da8e 100644 --- a/deid/dicom/fields.py +++ b/deid/dicom/fields.py @@ -237,7 +237,7 @@ def expand_field_expression(field, dicom, contenders=None): return fields -def get_fields(dicom, skip=None, expand_sequences=True, seen=None, from_file=True): +def get_fields(dicom, skip=None, expand_sequences=True, seen=None): """Expand all dicom fields into a list. Each entry is a DicomField. 
If we find a sequence, we unwrap it and @@ -251,7 +251,7 @@ def get_fields(dicom, skip=None, expand_sequences=True, seen=None, from_file=Tru skip = [skip] # Retrieve both dicom and file meta fields if dicom came from a file - datasets = [dicom, dicom.file_meta] if from_file else [dicom] + datasets = [d for d in [dicom, dicom.get('file_meta', None)] if d is not None] def add_element(element, name, uid, is_filemeta): diff --git a/deid/dicom/parser.py b/deid/dicom/parser.py index 38db2789..23c7a7ae 100644 --- a/deid/dicom/parser.py +++ b/deid/dicom/parser.py @@ -57,8 +57,7 @@ class DicomParser: """ def __init__( - self, dicom_file, recipe=None, config=None, force=True, disable_skip=False, from_file=True - ): + self, dicom_file, recipe=None, config=None, force=True, disable_skip=False): """Create new instance of DicomParser :param dicom_file: Path to a dicom file or instance of a pydicom.Dataset @@ -92,7 +91,6 @@ def __init__( if not isinstance(recipe, DeidRecipe): recipe = DeidRecipe(recipe) - self.from_file = from_file self.load(dicom_file, force=force) self.recipe = recipe @@ -116,15 +114,14 @@ def load(self, dicom_file, force=True): if isinstance(dicom_file, Dataset): self.dicom = dicom_file else: - # If we must read the file, the path must exist if not os.path.exists(dicom_file): bot.exit("%s does not exist." 
% dicom_file) self.dicom = read_file(dicom_file, force=force) # Set class variables that might be helpful later - if self.from_file: - self.dicom_file = os.path.abspath(self.dicom.filename) + if self.dicom.get('filename', None) is not None: + self.dicom_file = self.dicom.filename self.dicom_name = os.path.basename(self.dicom_file) def define(self, name, value): @@ -314,8 +311,7 @@ def get_fields(self, expand_sequences=True): dicom=self.dicom, expand_sequences=expand_sequences, seen=self.seen, - skip=self.skip, - from_file=self.from_file + skip=self.skip ) return self.fields diff --git a/docs/_docs/examples/deid-dataset.md b/docs/_docs/examples/deid-dataset.md index a8ee20bb..00700739 100644 --- a/docs/_docs/examples/deid-dataset.md +++ b/docs/_docs/examples/deid-dataset.md @@ -140,7 +140,7 @@ class DeidDataset: :param dataset: dataset that will be pseudonymized :returns: pseudonymized dataset """ - parser = DicomParser(dataset, self.recipe, from_file=False) + parser = DicomParser(dataset, self.recipe) # register functions that are specified in the recipe parser.define('replace_name', self.replace_name) parser.define('hash_func', self.deid_hash_func) From 957c0428260c023f3de0f912947a5c9528c4869e Mon Sep 17 00:00:00 2001 From: fcossio Date: Fri, 26 Aug 2022 22:44:15 +0200 Subject: [PATCH 4/5] updated formatting with black updated the docstring remove extra "None"s --- deid/dicom/fields.py | 3 +-- deid/dicom/parser.py | 12 ++++++------ docs/_docs/examples/deid-dataset.md | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/deid/dicom/fields.py b/deid/dicom/fields.py index 1351da8e..4d1a78d6 100644 --- a/deid/dicom/fields.py +++ b/deid/dicom/fields.py @@ -251,8 +251,7 @@ def get_fields(dicom, skip=None, expand_sequences=True, seen=None): skip = [skip] # Retrieve both dicom and file meta fields if dicom came from a file - datasets = [d for d in [dicom, dicom.get('file_meta', None)] if d is not None] - + datasets = [d for d in [dicom, 
dicom.get("file_meta")] if d] def add_element(element, name, uid, is_filemeta): """ diff --git a/deid/dicom/parser.py b/deid/dicom/parser.py index 23c7a7ae..e4a07d8a 100644 --- a/deid/dicom/parser.py +++ b/deid/dicom/parser.py @@ -57,16 +57,16 @@ class DicomParser: """ def __init__( - self, dicom_file, recipe=None, config=None, force=True, disable_skip=False): - """Create new instance of DicomParser + self, dicom_file, recipe=None, config=None, force=True, disable_skip=False + ): + """ + Create new instance of DicomParser :param dicom_file: Path to a dicom file or instance of a pydicom.Dataset :param recipe: a deid recipe, defaults to None :param config: deid config, defaults to None :param force: ignore errors when reading a dicom file, defaults to True :param disable_skip: _description_, defaults to False - :param from_file: the dicom_file comes from an actual file, defaults to True. - If set to False, some operations that extract file details are skipped. """ # Lookup for the dicom @@ -120,7 +120,7 @@ def load(self, dicom_file, force=True): self.dicom = read_file(dicom_file, force=force) # Set class variables that might be helpful later - if self.dicom.get('filename', None) is not None: + if self.dicom.get("filename", None) is not None: self.dicom_file = self.dicom.filename self.dicom_name = os.path.basename(self.dicom_file) @@ -311,7 +311,7 @@ def get_fields(self, expand_sequences=True): dicom=self.dicom, expand_sequences=expand_sequences, seen=self.seen, - skip=self.skip + skip=self.skip, ) return self.fields diff --git a/docs/_docs/examples/deid-dataset.md b/docs/_docs/examples/deid-dataset.md index 00700739..6e11090d 100644 --- a/docs/_docs/examples/deid-dataset.md +++ b/docs/_docs/examples/deid-dataset.md @@ -1,7 +1,7 @@ --- title: Deidentify a Pydicom Dataset category: Examples -order: 3 +order: 6 --- In this example we will create a custom class to deidentify a single instance of a `pydicom.Dataset` with a custom recipe. 
From ebec3be08a2dbfcebb9e13798fab1fb2d16c56a3 Mon Sep 17 00:00:00 2001 From: fcossio Date: Fri, 26 Aug 2022 23:04:35 +0200 Subject: [PATCH 5/5] added default value for dicom_file --- deid/dicom/parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deid/dicom/parser.py b/deid/dicom/parser.py index e4a07d8a..2b93c6af 100644 --- a/deid/dicom/parser.py +++ b/deid/dicom/parser.py @@ -120,9 +120,9 @@ def load(self, dicom_file, force=True): self.dicom = read_file(dicom_file, force=force) # Set class variables that might be helpful later - if self.dicom.get("filename", None) is not None: - self.dicom_file = self.dicom.filename - self.dicom_name = os.path.basename(self.dicom_file) + df = self.dicom.get("filename") + self.dicom_file = None if not df else os.path.abspath(df) + self.dicom_name = None if not df else os.path.basename(self.dicom_file) def define(self, name, value): """