From 490afbd16baac2ddce38b8eb54f6546ba531e754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yoshiki=20V=C3=A1zquez=20Baeza?= Date: Tue, 10 Sep 2019 17:56:45 -0700 Subject: [PATCH 1/3] ENH:Allow for missing features in feature metadata Allows for feature metadata to not include some of the features represented in the ordination. The errors and warnings have been updated to properly refer to either features or samples. I've added a couple of tests to validate the error/warning messages. Fixes #730 Fixes #733 --- emperor/core.py | 45 +++++++++++++++------------ tests/test_core.py | 76 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 86 insertions(+), 35 deletions(-) diff --git a/emperor/core.py b/emperor/core.py index deacb064..da50fd46 100644 --- a/emperor/core.py +++ b/emperor/core.py @@ -95,10 +95,11 @@ class Emperor(object): A list of the OrdinationResults objects with the same sample identifiers as the identifiers in ``ordination``. ignore_missing_samples: bool, optional - If set to `True` samples without metadata are included by setting all - metadata values to: ``This sample has not metadata``. By default an - exception will be raised if missing samples are encountered. Note, this - flag only takes effect if there's at least one overlapping sample. + If set to `True` samples and features without metadata are included by + setting all metadata values to: ``This element has not metadata``. By + default an exception will be raised if missing elements are + encountered. Note, this flag only takes effect if there's at least one + overlapping element. 
Attributes ---------- @@ -250,7 +251,7 @@ def __init__(self, ordination, mapping_file, feature_mapping_file=None, self.feature_mf = \ self._validate_metadata(feature_mapping_file, self.ordination.features, - ignore_missing_samples=False) + ignore_missing_samples, kind='feature') self._validate_ordinations() @@ -302,39 +303,43 @@ def _repr_html_(self): return display(HTML(str(self))) - def _validate_metadata(self, metadata, matrix, ignore_missing_samples): + def _validate_metadata(self, metadata, matrix, ignore_missing_samples, + kind='sample'): + + if kind not in {'sample', 'feature'}: + raise ValueError('Unsupported "kind" value %s' % kind) # metadata is optional for biplots, so we just create an empty table if metadata is None: metadata = pd.DataFrame(index=pd.Index(matrix.index, name='id')) - metadata['all'] = 'All objects' + metadata['all'] = 'All elements' return metadata - ordination_samples = set(matrix.index) - difference = ordination_samples - set(metadata.index) + ordination_elements = set(matrix.index) + difference = ordination_elements - set(metadata.index) - if difference == ordination_samples: - raise ValueError('None of the sample identifiers match between the' + if difference == ordination_elements: + raise ValueError('None of the %s identifiers match between the' ' metadata and the coordinates. Verify that you ' 'are using metadata and coordinates corresponding' - ' to the same dataset.') + ' to the same dataset.' % kind) if difference and not ignore_missing_samples: - raise KeyError("There are samples not included in the mapping " + elements = ', '.join(sorted([str(i) for i in difference])) + raise KeyError("There are %ss not included in the %s mapping " "file. Override this error by using the " "`ignore_missing_samples` argument. 
Offending " - "samples: %s" - % ', '.join(sorted([str(i) for i in difference]))) + "%ss: %s" % (kind, kind, kind, elements)) elif difference and ignore_missing_samples: - warnings.warn("%d out of %d samples have no metadata and are being" + warnings.warn("%d out of %d %ss have no metadata and are being" " included with a placeholder value." % - (len(difference), len(ordination_samples)), + (len(difference), len(ordination_elements), kind), EmperorWarning) - # pad the missing samples + # pad the missing elements data = np.full((len(difference), metadata.shape[1]), - 'This sample has no metadata', dtype=' Date: Wed, 2 Oct 2019 13:41:30 -0700 Subject: [PATCH 2/3] Address reviews from @wasade --- emperor/core.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/emperor/core.py b/emperor/core.py index da50fd46..b5684e0d 100644 --- a/emperor/core.py +++ b/emperor/core.py @@ -96,7 +96,7 @@ class Emperor(object): identifiers as the identifiers in ``ordination``. ignore_missing_samples: bool, optional If set to `True` samples and features without metadata are included by - setting all metadata values to: ``This element has not metadata``. By + setting all metadata values to: ``This element has no metadata``. By default an exception will be raised if missing elements are encountered. Note, this flag only takes effect if there's at least one overlapping element. @@ -337,10 +337,9 @@ def _validate_metadata(self, metadata, matrix, ignore_missing_samples, EmperorWarning) # pad the missing elements - data = np.full((len(difference), metadata.shape[1]), - 'This element has no metadata', dtype=' Date: Mon, 21 Oct 2019 23:45:54 -0700 Subject: [PATCH 3/3] ENH: Address @wasade's comment to truncate output The output is truncated to 5 missing elements. 
--- emperor/core.py | 17 ++++++++++++++--- tests/test_core.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/emperor/core.py b/emperor/core.py index b5684e0d..c981d6d0 100644 --- a/emperor/core.py +++ b/emperor/core.py @@ -325,11 +325,22 @@ def _validate_metadata(self, metadata, matrix, ignore_missing_samples, ' to the same dataset.' % kind) if difference and not ignore_missing_samples: - elements = ', '.join(sorted([str(i) for i in difference])) + # sort the elements so we have a deterministic output + difference = sorted([str(i) for i in difference]) + + # if there's more than 5 missing elements, truncate the list + if len(difference) > 5: + elements = ', '.join(difference[:5]) + suffix = ("Showing only the first 5 %ss out of %d: %s ..." % + (kind, len(difference), elements)) + else: + elements = ', '.join(difference) + suffix = ("Offending %ss: %s" % (kind, elements)) + raise KeyError("There are %ss not included in the %s mapping " "file. Override this error by using the " - "`ignore_missing_samples` argument. Offending " - "%ss: %s" % (kind, kind, kind, elements)) + "`ignore_missing_samples` argument. %s" % + (kind, kind, suffix)) elif difference and ignore_missing_samples: warnings.warn("%d out of %d %ss have no metadata and are being" " included with a placeholder value." % diff --git a/tests/test_core.py b/tests/test_core.py index 6956ce3c..2155445d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -241,6 +241,19 @@ def test_one_dimensional(self): "two dimensions are not supported"): Emperor(self.ord_res, self.mf, remote=False) + def test_initial_unbalanced_more_than_five(self): + mf = self.mf.copy() + mf.drop(['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.607', 'PC.636'], + inplace=True) + with self.assertRaisesRegexp(KeyError, "There are samples not " + "included in the sample mapping file. " + "Override this error by using the " + "`ignore_missing_samples` argument. 
" + "Showing only the first 5 samples out of " + "6: PC.354, PC.355, PC.356, PC.481, " + "PC.607 ..."): + Emperor(self.ord_res, mf, remote=self.url) + def test_initial_unbalanced(self): mf = self.mf.copy() mf.drop(['PC.354'], inplace=True)