biocore · wasade · Oct 22, 2019 · Sep 11, 2019 · Oct 2, 2019 · Oct 22, 2019
diff --git a/emperor/core.py b/emperor/core.py
@@ -95,10 +95,11 @@ class Emperor(object):
         A list of the OrdinationResults objects with the same sample
         identifiers as the identifiers in ``ordination``.
     ignore_missing_samples: bool, optional
-        If set to `True` samples without metadata are included by setting all
-        metadata values to: ``This sample has not metadata``. By default an
-        exception will be raised if missing samples are encountered. Note, this
-        flag only takes effect if there's at least one overlapping sample.
+        If set to `True`, samples and features without metadata are included
+        by setting all metadata values to: ``This element has no metadata``.
+        By default an exception will be raised if missing elements are
+        encountered. Note, this flag only takes effect if there's at least one
+        overlapping element.
 
     Attributes
     ----------
@@ -250,7 +251,7 @@ def __init__(self, ordination, mapping_file, feature_mapping_file=None,
             self.feature_mf = \
                 self._validate_metadata(feature_mapping_file,
                                         self.ordination.features,
-                                        ignore_missing_samples=False)
+                                        ignore_missing_samples, kind='feature')
 
         self._validate_ordinations()
 
@@ -302,40 +303,54 @@ def _repr_html_(self):
 
         return display(HTML(str(self)))
 
-    def _validate_metadata(self, metadata, matrix, ignore_missing_samples):
+    def _validate_metadata(self, metadata, matrix, ignore_missing_samples,
+                           kind='sample'):
+
+        if kind not in {'sample', 'feature'}:
+            raise ValueError('Unsupported "kind" value %s' % kind)
 
         # metadata is optional for biplots, so we just create an empty table
         if metadata is None:
             metadata = pd.DataFrame(index=pd.Index(matrix.index, name='id'))
-            metadata['all'] = 'All objects'
+            metadata['all'] = 'All elements'
             return metadata
 
-        ordination_samples = set(matrix.index)
-        difference = ordination_samples - set(metadata.index)
+        ordination_elements = set(matrix.index)
+        difference = ordination_elements - set(metadata.index)
 
-        if difference == ordination_samples:
-            raise ValueError('None of the sample identifiers match between the'
+        if difference == ordination_elements:
+            raise ValueError('None of the %s identifiers match between the'
                              ' metadata and the coordinates. Verify that you '
                              'are using metadata and coordinates corresponding'
-                             ' to the same dataset.')
+                             ' to the same dataset.' % kind)
 
         if difference and not ignore_missing_samples:
-            raise KeyError("There are samples not included in the mapping "
+            # sort the elements so we have a deterministic output
+            difference = sorted([str(i) for i in difference])
+
+            # if there are more than 5 missing elements, truncate the list
+            if len(difference) > 5:
+                elements = ', '.join(difference[:5])
+                suffix = ("Showing only the first 5 %ss out of %d: %s ..." %
+                          (kind, len(difference), elements))
+            else:
+                elements = ', '.join(difference)
+                suffix = ("Offending %ss: %s" % (kind, elements))
+
+            raise KeyError("There are %ss not included in the %s mapping "
                            "file. Override this error by using the "
-                           "`ignore_missing_samples` argument. Offending "
-                           "samples: %s"
-                           % ', '.join(sorted([str(i) for i in difference])))
+                           "`ignore_missing_samples` argument. %s" %
+                           (kind, kind, suffix))
         elif difference and ignore_missing_samples:
-            warnings.warn("%d out of %d samples have no metadata and are being"
+            warnings.warn("%d out of %d %ss have no metadata and are being"
                           " included with a placeholder value." %
-                          (len(difference), len(ordination_samples)),
+                          (len(difference), len(ordination_elements), kind),
                           EmperorWarning)
 
-            # pad the missing samples
-            data = np.full((len(difference), metadata.shape[1]),
-                           'This sample has no metadata', dtype='<U27')
-            pad = pd.DataFrame(index=difference, columns=self.mf.columns,
-                               data=data)
+            # pad the missing elements
+            pad = pd.DataFrame(index=difference, columns=metadata.columns,
+                               dtype=str)
+            pad.fillna('This element has no metadata', inplace=True)
             metadata = pd.concat([metadata, pad])
 
         # filter all metadata that we may have for which we don't have any

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -223,7 +223,7 @@ def test_initial_biplots_no_metadata(self):
 
         empty_mf = pd.DataFrame(index=['f.PC.636', 'f.PC.635', 'f.PC.356',
                                        'f.PC.481', 'f.PC.354'])
-        empty_mf['all'] = 'All objects'
+        empty_mf['all'] = 'All elements'
 
         pd.util.testing.assert_frame_equal(empty_mf, emp.feature_mf,
                                            check_names=False)
@@ -241,22 +241,48 @@ def test_one_dimensional(self):
                                      "two dimensions are not supported"):
             Emperor(self.ord_res, self.mf, remote=False)
 
+    def test_initial_unbalanced_more_than_five(self):
+        mf = self.mf.copy()
+        mf.drop(['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.607', 'PC.636'],
+                inplace=True)
+        with self.assertRaisesRegexp(KeyError, "There are samples not "
+                                     "included in the sample mapping file. "
+                                     "Override this error by using the "
+                                     "`ignore_missing_samples` argument. "
+                                     "Showing only the first 5 samples out of "
+                                     "6: PC.354, PC.355, PC.356, PC.481, "
+                                     "PC.607 ..."):
+            Emperor(self.ord_res, mf, remote=self.url)
+
     def test_initial_unbalanced(self):
-        self.mf.drop(['PC.354'], inplace=True)
+        mf = self.mf.copy()
+        mf.drop(['PC.354'], inplace=True)
         with self.assertRaisesRegexp(KeyError, "There are samples not "
-                                     "included in the mapping file. Override "
-                                     "this error by using the "
+                                     "included in the sample mapping file. "
+                                     "Override this error by using the "
                                      "`ignore_missing_samples` argument. "
                                      "Offending samples: PC.354"):
-            Emperor(self.ord_res, self.mf, remote=self.url)
+            Emperor(self.ord_res, mf, remote=self.url)
+
+        # test feature metadata
+        fmf = self.feature_mf.copy()
+        fmf.drop(['f.PC.636'], inplace=True)
+        with self.assertRaisesRegexp(KeyError, "There are features not "
+                                     "included in the feature mapping file. "
+                                     "Override this error by using the "
+                                     "`ignore_missing_samples` argument. "
+                                     "Offending features: f.PC.636"):
+            Emperor(self.biplot, self.mf, fmf, remote=self.url)
 
     def test_initial_unbalanced_ignore(self):
         expected = self.mf.copy()
-        self.mf.drop(['PC.634'], inplace=True)
+
+        mf = self.mf.copy()
+        mf.drop(['PC.634'], inplace=True)
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('always')
-            emp = Emperor(self.ord_res, self.mf, remote=self.url,
+            emp = Emperor(self.ord_res, mf, remote=self.url,
                           ignore_missing_samples=True)
 
             self.assertTrue(len(w) == 1)
@@ -265,21 +291,54 @@ def test_initial_unbalanced_ignore(self):
                              "being included with a placeholder value.",
                              str(w[-1].message))
 
-            expected.loc['PC.634'] = ['This sample has no metadata'] * 3
+            expected.loc['PC.634'] = ['This element has no metadata'] * 3
 
             pd.util.testing.assert_frame_equal(expected.sort_index(),
                                                emp.mf.sort_index(),
                                                check_names=False)
 
+        expected = self.feature_mf.copy()
+
+        fmf = self.feature_mf.copy()
+        fmf.drop(['f.PC.636'], inplace=True)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            emp = Emperor(self.biplot, self.mf, fmf, remote=self.url,
+                          ignore_missing_samples=True)
+
+            self.assertTrue(len(w) == 1)
+            self.assertTrue(issubclass(w[-1].category, EmperorWarning))
+            self.assertEqual("1 out of 5 features have no metadata and are "
+                             "being included with a placeholder value.",
+                             str(w[-1].message))
+
+            expected.loc['f.PC.636'] = ['This element has no metadata'] * 2
+
+            pd.util.testing.assert_frame_equal(expected.sort_index(),
+                                               emp.feature_mf.sort_index(),
+                                               check_names=False)
+
     def test_no_overlap(self):
-        self.mf.index = self.mf.index + '.not'
+        mf = self.mf.copy()
+        mf.index = mf.index + '.not'
 
         with self.assertRaisesRegexp(ValueError, 'None of the sample '
                                      'identifiers match between the metadata '
                                      'and the coordinates. Verify that you are'
                                      ' using metadata and coordinates '
                                      'corresponding to the same dataset.'):
-            Emperor(self.ord_res, self.mf, remote=self.url)
+            Emperor(self.ord_res, mf, remote=self.url)
+
+        fmf = self.feature_mf.copy()
+        fmf.index = fmf.index + '.not'
+
+        with self.assertRaisesRegexp(ValueError, 'None of the feature '
+                                     'identifiers match between the metadata '
+                                     'and the coordinates. Verify that you are'
+                                     ' using metadata and coordinates '
+                                     'corresponding to the same dataset.'):
+            Emperor(self.biplot, self.mf, fmf, remote=self.url)
 
     def test_get_template(self):
         emp = Emperor(self.ord_res, self.mf, remote=False)
@@ -504,11 +563,11 @@ def test_process_data_biplots_no_metadata(self):
                          ['f.PC.636', 'f.PC.635', 'f.PC.356', 'f.PC.481',
                           'f.PC.354'])
         self.assertEqual(bi_headers, ['id', 'all'])
-        self.assertEqual(bi_metadata, [['f.PC.636', 'All objects'],
-                                       ['f.PC.635', 'All objects'],
-                                       ['f.PC.356', 'All objects'],
-                                       ['f.PC.481', 'All objects'],
-                                       ['f.PC.354', 'All objects']])
+        self.assertEqual(bi_metadata, [['f.PC.636', 'All elements'],
+                                       ['f.PC.635', 'All elements'],
+                                       ['f.PC.356', 'All elements'],
+                                       ['f.PC.481', 'All elements'],
+                                       ['f.PC.354', 'All elements']])
 
     def test_process_data_custom_axes(self):
         emp = Emperor(self.ord_res, self.mf, remote=False)