Skip to content

Commit

Permalink
Merge pull request #95 from uc-cdis/feat/nested-fields
Browse files Browse the repository at this point in the history
HP-1142 Feat/nested field names
  • Loading branch information
mfshao authored Jul 19, 2023
2 parents f2f85fc + f269adc commit d796552
Show file tree
Hide file tree
Showing 16 changed files with 532 additions and 420 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ repos:
- id: no-commit-to-branch
args: [--branch, develop, --branch, master, --pattern, release/.*]
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 23.3.0
hooks:
- id: black
17 changes: 17 additions & 0 deletions docs/config_agg_mds.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,23 @@ where:
}
```

#### Nested Field Names
**(New in 3.1.0)** The field mapping now supports creating nested fields in the result by using [JSON path syntax](https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html) for field names.
For example, the following field mapping
```json
"study_metadata.summary": {
"path":"description",
"default" : "N/A"
}
```

will yield output like this:
```json
"study_metadata":{
"summary": "This is a summary"
}
```

### Per Item Overrides

The configuration file also supports what are called per-item overrides. This gives you the ability to override or add values to specific metadata entries after they are normalized but before they are added to the Aggregate Metadata. To override an item value, add a JSON object with the id of the item you want to override, as shown in the figure above. The JSON object should set each field that you want to override. If the item is not present, the per-item values are ignored. If the per-item values are not present in the normalized fields, they are added.
Expand Down
2 changes: 1 addition & 1 deletion docs/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ components:
type: http
info:
title: Framework Services Object Management Service
version: 3.0.0
version: 3.1.0
openapi: 3.0.2
paths:
/_status:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "mds"
version = "3.0.0"
version = "3.1.0"
description = "Metadata Service"
authors = ["CTDS UChicago <cdis@uchicago.edu>"]
license = "Apache-2.0"
Expand Down
65 changes: 40 additions & 25 deletions src/mds/agg_mds/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,62 +187,77 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d
results = {}

for key, value in mappings.items():
try:
jsonpath_expr = parse(key)
except JSONPathError as exc:
logger.error(
f"Invalid JSON Path expression {exc} found as key . See https://github.com/json-path/JsonPath. Skipping this field"
)
continue
key_entries_in_schema = jsonpath_expr.find(schema)

if isinstance(value, dict): # have a complex assignment
expression = value.get("path", None)

hasDefaultValue = False
has_default_value = False
default_value = None
# get adapter's default value if set
if "default" in value:
hasDefaultValue = True
has_default_value = True
default_value = value["default"]

# get schema default value if set
if hasDefaultValue is False:
if key in schema and schema[key].default is not None:
hasDefaultValue = True
default_value = schema[key].default
if has_default_value is False:
if (
len(key_entries_in_schema)
and key_entries_in_schema[0].value.default is not None
):
has_default_value = True
default_value = key_entries_in_schema[0].value.default

field_value = get_json_path_value(
expression, item, hasDefaultValue, default_value
expression, item, has_default_value, default_value
)

filters = value.get("filters", [])
for filter in filters:
field_value = FieldFilters.execute(filter, field_value)
for flt in filters:
field_value = FieldFilters.execute(flt, field_value)

elif isinstance(value, str) and "path:" in value:
# process as json path
expression = value.split("path:")[1]

hasDefaultValue = False
has_default_value = False
default_value = None
if key in schema:
d = schema[key].default
if d is not None:
hasDefaultValue = True
default_value = d
if (
len(key_entries_in_schema)
and key_entries_in_schema[0].value.default is not None
):
has_default_value = True
default_value = key_entries_in_schema[0].value.default

field_value = get_json_path_value(
expression, item, hasDefaultValue, default_value
expression, item, has_default_value, default_value
)
else:
field_value = value

for f in global_filters:
field_value = FieldFilters.execute(f, field_value)
if key in schema:
field_value = schema[key].normalize_value(field_value)
for gf in global_filters:
field_value = FieldFilters.execute(gf, field_value)
if len(key_entries_in_schema):
field_value = key_entries_in_schema[0].value.normalize_value(
field_value
)
# set to default if conversion failed and a default value is available
if field_value is None:
if hasDefaultValue:
if has_default_value:
field_value = default_value
else:
logger.warning(
f"{key} = None{', is not in the schema,' if key not in schema else ''} "
f"and has no default value. Consider adding {key} to the schema"
)
results[key] = field_value
jsonpath_expr.update_or_create(results, field_value)
return results

@staticmethod
Expand Down Expand Up @@ -687,9 +702,9 @@ def addGen3ExpectedFields(
results = mapped_fields

if isinstance(results.get("investigators"), list):
results["investigators"] = results["investigators"].join(", ")
results["investigators"] = ",".join(results["investigators"])
if isinstance(results.get("investigators_name"), list):
results["investigators_name"] = results["investigators_name"].join(", ")
results["investigators_name"] = ",".join(results["investigators_name"])
return results

def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]:
Expand Down Expand Up @@ -1009,7 +1024,7 @@ def normalizeToGen3MDSFields(self, data, **kwargs) -> Dict[str, Any]:
results = {}
for guid, record in data["results"].items():
if study_field not in record:
logger.error(f"Study field not in record. Skipping")
logger.error("Study field not in record. Skipping")
continue
item = Gen3Adapter.addGen3ExpectedFields(
record[study_field],
Expand Down
4 changes: 2 additions & 2 deletions src/mds/agg_mds/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,14 @@ class FieldAggregation:


def string_to_array(s: str) -> Optional[List[str]]:
if s == "":
if not s:
return []
return [s]


def array_to_string(arr: Optional[list]) -> Optional[str]:
if arr is None:
logger.error(f"array is None")
logger.error("array is None")
return None
return "".join(arr)

Expand Down
54 changes: 54 additions & 0 deletions tests/test_agg_mds_cidc_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ def test_get_metadata_cidc():
"data_type": "path:supporting_data",
"primary_site": "path:location",
"tags": [],
"study_metadata.minimal_info.alternative_study_description": {
"path": "description",
"filters": ["strip_html", "prepare_cidc_description"],
},
}

respx.get("http://test/ok").mock(side_effect=httpx.TimeoutException)
Expand Down Expand Up @@ -147,6 +151,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Prostate", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "Note: This collection has special restrictions on its usage. See Data Usage Policies and Restrictions. The Cancer Imaging Program (CIP) is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this image collection of prostate adenocarcinoma (PRAD) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome. Please see the TCGA-PRAD wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_blca": {
Expand Down Expand Up @@ -175,6 +184,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Bladder", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Bladder Endothelial Carcinoma (TCGA-BLCA) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP), with the cooperation of several of the TCGA tissue-contributing institutions, has archived a large portion of the radiological images of the genetically-analyzed BLCA cases.Please see the TCGA-BLCA wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_ucec": {
Expand Down Expand Up @@ -203,6 +217,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Uterus", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Uterine Corpus Endometrial Carcinoma (TCGA-UCEC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the genetically-analyzed UCEC cases.Please see the TCGA-UCEC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_hnsc": {
Expand Down Expand Up @@ -231,6 +250,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Head-Neck", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Imaging Program (CIP) is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this large PET/CT multi-sequence image collection of head and neck squamous cell carcinoma (HNSC) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome. Please see the TCGA -HNSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_lusc": {
Expand Down Expand Up @@ -259,6 +283,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Lung", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Lung Squamous Cell Carcinoma (TCGA-LUSC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the LUSC cases.Please see the TCGA-LUSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
}
Expand Down Expand Up @@ -298,6 +327,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Prostate", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "Note: This collection has special restrictions on its usage. See Data Usage Policies and Restrictions. The Cancer Imaging Program (CIP) is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this image collection of prostate adenocarcinoma (PRAD) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome. Please see the TCGA-PRAD wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_blca": {
Expand Down Expand Up @@ -332,6 +366,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Bladder", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Bladder Endothelial Carcinoma (TCGA-BLCA) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP), with the cooperation of several of the TCGA tissue-contributing institutions, has archived a large portion of the radiological images of the genetically-analyzed BLCA cases.Please see the TCGA-BLCA wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_ucec": {
Expand Down Expand Up @@ -366,6 +405,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Uterus", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Uterine Corpus Endometrial Carcinoma (TCGA-UCEC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the genetically-analyzed UCEC cases.Please see the TCGA-UCEC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_hnsc": {
Expand Down Expand Up @@ -400,6 +444,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Head-Neck", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Imaging Program (CIP) is working directly with primary investigators from institutes participating in TCGA to obtain and load images relating to the genomic, clinical, and pathological data being stored within the TCGA Data Portal. Currently this large PET/CT multi-sequence image collection of head and neck squamous cell carcinoma (HNSC) patients can be matched by each unique case identifier with the extensive gene and expression data of the same case from The Cancer Genome Atlas Data Portal to research the link between clinical phenome and tissue genome. Please see the TCGA -HNSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
"tcga_lusc": {
Expand Down Expand Up @@ -434,6 +483,11 @@ def test_get_metadata_cidc():
{"name": "Clinical, Genomics", "category": "data_type"},
{"name": "Lung", "category": "primary_site"},
],
"study_metadata": {
"minimal_info": {
"alternative_study_description": "The Cancer Genome Atlas-Lung Squamous Cell Carcinoma (TCGA-LUSC) data collection is part of a larger effort to enhance the TCGA http://cancergenome.nih.gov/ data set with characterized radiological images. The Cancer Imaging Program (CIP) with the cooperation of several of the TCGA tissue-contributing institutions are working to archive a large portion of the radiological images of the LUSC cases.Please see the TCGA-LUSC wiki page to learn more about the images and to obtain any supporting metadata for this collection."
}
},
},
},
}
Loading

0 comments on commit d796552

Please sign in to comment.