Merge pull request #95 from uc-cdis/feat/nested-fields

HP-1142 Feat/nested field names
uc-cdis · Jul 19, 2023 · d796552 · d796552
2 parents f2f85fc + f269adc
commit d796552
Show file tree

Hide file tree

Showing 16 changed files with 532 additions and 420 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,6 +13,6 @@ repos:
     -   id: no-commit-to-branch
         args: [--branch, develop, --branch, master, --pattern, release/.*]
 -   repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 23.3.0
     hooks:
     -   id: black
diff --git a/docs/config_agg_mds.md b/docs/config_agg_mds.md
@@ -333,6 +333,23 @@ where:
 }
 ```
 
+#### Nested Field Names
+**(New in 3.1.0)** The field mapping now supports creating nested fields in the result by using [JSON path syntax](https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html) for field names.
+For example, the following field mapping
+```json
+"study_metadata.summary": {
+      "path":"description",
+      "default" : "N/A"
+}
+```
+
+will yield a result like this as output:
+```json
+"study_metadata":{
+  "summary": "This is a summary"
+}
+```
+
 ### Per Item Overrides
 
 The configuration file also supports what is called per item overrides. This gives you the ability to override or add values to specific metadata entries after they are normalized but before they are added to the Aggregate Metadata. To override an item value, add a JSON object with the id of the item you want to override, as shown in the figure above. The JSON object should set each field that you want to override. If the item is not present, the per item values are ignored. If the per item values are not present in the normalized fields, they are added.

diff --git a/docs/openapi.yaml b/docs/openapi.yaml
@@ -92,7 +92,7 @@ components:
       type: http
 info:
   title: Framework Services Object Management Service
-  version: 3.0.0
+  version: 3.1.0
 openapi: 3.0.2
 paths:
   /_status:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "mds"
-version = "3.0.0"
+version = "3.1.0"
 description = "Metadata Service"
 authors = ["CTDS UChicago <cdis@uchicago.edu>"]
 license = "Apache-2.0"

diff --git a/src/mds/agg_mds/adapters.py b/src/mds/agg_mds/adapters.py
@@ -187,62 +187,77 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d
         results = {}
 
         for key, value in mappings.items():
+            try:
+                jsonpath_expr = parse(key)
+            except JSONPathError as exc:
+                logger.error(
+                    f"Invalid JSON Path expression {exc} found as key . See https://github.com/json-path/JsonPath. Skipping this field"
+                )
+                continue
+            key_entries_in_schema = jsonpath_expr.find(schema)
+
             if isinstance(value, dict):  # have a complex assignment
                 expression = value.get("path", None)
 
-                hasDefaultValue = False
+                has_default_value = False
                 default_value = None
                 # get adapter's default value if set
                 if "default" in value:
-                    hasDefaultValue = True
+                    has_default_value = True
                     default_value = value["default"]
 
                 # get schema default value if set
-                if hasDefaultValue is False:
-                    if key in schema and schema[key].default is not None:
-                        hasDefaultValue = True
-                        default_value = schema[key].default
+                if has_default_value is False:
+                    if (
+                        len(key_entries_in_schema)
+                        and key_entries_in_schema[0].value.default is not None
+                    ):
+                        has_default_value = True
+                        default_value = key_entries_in_schema[0].value.default
 
                 field_value = get_json_path_value(
-                    expression, item, hasDefaultValue, default_value
+                    expression, item, has_default_value, default_value
                 )
 
                 filters = value.get("filters", [])
-                for filter in filters:
-                    field_value = FieldFilters.execute(filter, field_value)
+                for flt in filters:
+                    field_value = FieldFilters.execute(flt, field_value)
 
             elif isinstance(value, str) and "path:" in value:
                 # process as json path
                 expression = value.split("path:")[1]
 
-                hasDefaultValue = False
+                has_default_value = False
                 default_value = None
-                if key in schema:
-                    d = schema[key].default
-                    if d is not None:
-                        hasDefaultValue = True
-                        default_value = d
+                if (
+                    len(key_entries_in_schema)
+                    and key_entries_in_schema[0].value.default is not None
+                ):
+                    has_default_value = True
+                    default_value = key_entries_in_schema[0].value.default
 
                 field_value = get_json_path_value(
-                    expression, item, hasDefaultValue, default_value
+                    expression, item, has_default_value, default_value
                 )
             else:
                 field_value = value
 
-            for f in global_filters:
-                field_value = FieldFilters.execute(f, field_value)
-            if key in schema:
-                field_value = schema[key].normalize_value(field_value)
+            for gf in global_filters:
+                field_value = FieldFilters.execute(gf, field_value)
+            if len(key_entries_in_schema):
+                field_value = key_entries_in_schema[0].value.normalize_value(
+                    field_value
+                )
             # set to default if conversion failed and a default value is available
             if field_value is None:
-                if hasDefaultValue:
+                if has_default_value:
                     field_value = default_value
                 else:
                     logger.warning(
                         f"{key} = None{', is not in the schema,' if key not in schema else ''} "
                         f"and has no default value. Consider adding {key} to the schema"
                     )
-            results[key] = field_value
+            jsonpath_expr.update_or_create(results, field_value)
         return results
 
     @staticmethod
@@ -687,9 +702,9 @@ def addGen3ExpectedFields(
                 results = mapped_fields
 
         if isinstance(results.get("investigators"), list):
-            results["investigators"] = results["investigators"].join(", ")
+            results["investigators"] = ",".join(results["investigators"])
         if isinstance(results.get("investigators_name"), list):
-            results["investigators_name"] = results["investigators_name"].join(", ")
+            results["investigators_name"] = ",".join(results["investigators_name"])
         return results
 
     def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]:
@@ -1009,7 +1024,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]:
         results = {}
         for guid, record in data["results"].items():
             if study_field not in record:
-                logger.error(f"Study field not in record. Skipping")
+                logger.error("Study field not in record. Skipping")
                 continue
             item = Gen3Adapter.addGen3ExpectedFields(
                 record[study_field],

diff --git a/src/mds/agg_mds/commons.py b/src/mds/agg_mds/commons.py
@@ -40,14 +40,14 @@ class FieldAggregation:
 
 
 def string_to_array(s: str) -> Optional[List[str]]:
-    if s == "":
+    if not s:
         return []
     return [s]
 
 
 def array_to_string(arr: Optional[list]) -> Optional[str]:
     if arr is None:
-        logger.error(f"array is None")
+        logger.error("array is None")
         return None
     return "".join(arr)
 

diff --git a/tests/test_agg_mds_cidc_adapter.py b/tests/test_agg_mds_cidc_adapter.py
@@ -93,6 +93,10 @@ def test_get_metadata_cidc():
         "data_type": "path:supporting_data",
         "primary_site": "path:location",
         "tags": [],
+        "study_metadata.minimal_info.alternative_study_description": {
+            "path": "description",
+            "filters": ["strip_html", "prepare_cidc_description"],
+        },
     }
 
     respx.get("http://test/ok").mock(side_effect=httpx.TimeoutException)
@@ -147,6 +151,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Prostate", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "Note: This collection has special restrictions on its usage. See Data Usage Policies and Restrictions. The Cancer Imaging Program (CIP)  is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this image collection of prostate adenocarcinoma (PRAD) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome.  Please see the TCGA-PRAD wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_blca": {
@@ -175,6 +184,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Bladder", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Bladder Endothelial Carcinoma (TCGA-BLCA) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP), with the cooperation of several of the TCGA tissue-contributing institutions, has archived a large portion of the radiological images of the genetically-analyzed BLCA cases.Please see the TCGA-BLCA wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_ucec": {
@@ -203,6 +217,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Uterus", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Uterine Corpus Endometrial Carcinoma (TCGA-UCEC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the genetically-analyzed UCEC cases.Please see the TCGA-UCEC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_hnsc": {
@@ -231,6 +250,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Head-Neck", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Imaging Program (CIP)  is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this large PET/CT multi-sequence image collection of head and neck squamous cell carcinoma (HNSC) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome.  Please see the TCGA -HNSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_lusc": {
@@ -259,6 +283,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Lung", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Lung Squamous Cell Carcinoma (TCGA-LUSC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the LUSC cases.Please see the TCGA-LUSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
     }
@@ -298,6 +327,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Prostate", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "Note: This collection has special restrictions on its usage. See Data Usage Policies and Restrictions. The Cancer Imaging Program (CIP)  is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this image collection of prostate adenocarcinoma (PRAD) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome.  Please see the TCGA-PRAD wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_blca": {
@@ -332,6 +366,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Bladder", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Bladder Endothelial Carcinoma (TCGA-BLCA) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP), with the cooperation of several of the TCGA tissue-contributing institutions, has archived a large portion of the radiological images of the genetically-analyzed BLCA cases.Please see the TCGA-BLCA wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_ucec": {
@@ -366,6 +405,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Uterus", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Uterine Corpus Endometrial Carcinoma (TCGA-UCEC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the genetically-analyzed UCEC cases.Please see the TCGA-UCEC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_hnsc": {
@@ -400,6 +444,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Head-Neck", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Imaging Program (CIP)  is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this large PET/CT multi-sequence image collection of head and neck squamous cell carcinoma (HNSC) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome.  Please see the TCGA -HNSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
         "tcga_lusc": {
@@ -434,6 +483,11 @@ def test_get_metadata_cidc():
                     {"name": "Clinical, Genomics", "category": "data_type"},
                     {"name": "Lung", "category": "primary_site"},
                 ],
+                "study_metadata": {
+                    "minimal_info": {
+                        "alternative_study_description": "The Cancer Genome Atlas-Lung Squamous Cell Carcinoma (TCGA-LUSC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the LUSC cases.Please see the TCGA-LUSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
+                    }
+                },
             },
         },
     }