Skip to content

Commit

Permalink
Merge pull request #734 from ElDeveloper/issue-731
Browse files Browse the repository at this point in the history
ENH: Allow for missing features in feature metadata
  • Loading branch information
wasade authored Oct 22, 2019
2 parents 886e42c + 9b4c0bb commit 8a530bc
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 38 deletions.
61 changes: 38 additions & 23 deletions emperor/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,11 @@ class Emperor(object):
A list of the OrdinationResults objects with the same sample
identifiers as the identifiers in ``ordination``.
ignore_missing_samples: bool, optional
If set to `True` samples without metadata are included by setting all
metadata values to: ``This sample has not metadata``. By default an
exception will be raised if missing samples are encountered. Note, this
flag only takes effect if there's at least one overlapping sample.
If set to `True` samples and features without metadata are included by
setting all metadata values to: ``This element has no metadata``. By
default an exception will be raised if missing elements are
encountered. Note, this flag only takes effect if there's at least one
overlapping element.
Attributes
----------
Expand Down Expand Up @@ -250,7 +251,7 @@ def __init__(self, ordination, mapping_file, feature_mapping_file=None,
self.feature_mf = \
self._validate_metadata(feature_mapping_file,
self.ordination.features,
ignore_missing_samples=False)
ignore_missing_samples, kind='feature')

self._validate_ordinations()

Expand Down Expand Up @@ -302,40 +303,54 @@ def _repr_html_(self):

return display(HTML(str(self)))

def _validate_metadata(self, metadata, matrix, ignore_missing_samples):
def _validate_metadata(self, metadata, matrix, ignore_missing_samples,
kind='sample'):

if kind not in {'sample', 'feature'}:
raise ValueError('Unsupported "kind" value %s' % kind)

# metadata is optional for biplots, so we just create an empty table
if metadata is None:
metadata = pd.DataFrame(index=pd.Index(matrix.index, name='id'))
metadata['all'] = 'All objects'
metadata['all'] = 'All elements'
return metadata

ordination_samples = set(matrix.index)
difference = ordination_samples - set(metadata.index)
ordination_elements = set(matrix.index)
difference = ordination_elements - set(metadata.index)

if difference == ordination_samples:
raise ValueError('None of the sample identifiers match between the'
if difference == ordination_elements:
raise ValueError('None of the %s identifiers match between the'
' metadata and the coordinates. Verify that you '
'are using metadata and coordinates corresponding'
' to the same dataset.')
' to the same dataset.' % kind)

if difference and not ignore_missing_samples:
raise KeyError("There are samples not included in the mapping "
# sort the elements so we have a deterministic output
difference = sorted([str(i) for i in difference])

# if there are more than 5 missing elements, truncate the list
if len(difference) > 5:
elements = ', '.join(difference[:5])
suffix = ("Showing only the first 5 %ss out of %d: %s ..." %
(kind, len(difference), elements))
else:
elements = ', '.join(difference)
suffix = ("Offending %ss: %s" % (kind, elements))

raise KeyError("There are %ss not included in the %s mapping "
"file. Override this error by using the "
"`ignore_missing_samples` argument. Offending "
"samples: %s"
% ', '.join(sorted([str(i) for i in difference])))
"`ignore_missing_samples` argument. %s" %
(kind, kind, suffix))
elif difference and ignore_missing_samples:
warnings.warn("%d out of %d samples have no metadata and are being"
warnings.warn("%d out of %d %ss have no metadata and are being"
" included with a placeholder value." %
(len(difference), len(ordination_samples)),
(len(difference), len(ordination_elements), kind),
EmperorWarning)

# pad the missing samples
data = np.full((len(difference), metadata.shape[1]),
'This sample has no metadata', dtype='<U27')
pad = pd.DataFrame(index=difference, columns=self.mf.columns,
data=data)
# pad the missing elements
pad = pd.DataFrame(index=difference, columns=metadata.columns,
dtype=str)
pad.fillna('This element has no metadata', inplace=True)
metadata = pd.concat([metadata, pad])

# filter all metadata that we may have for which we don't have any
Expand Down
89 changes: 74 additions & 15 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_initial_biplots_no_metadata(self):

empty_mf = pd.DataFrame(index=['f.PC.636', 'f.PC.635', 'f.PC.356',
'f.PC.481', 'f.PC.354'])
empty_mf['all'] = 'All objects'
empty_mf['all'] = 'All elements'

pd.util.testing.assert_frame_equal(empty_mf, emp.feature_mf,
check_names=False)
Expand All @@ -241,22 +241,48 @@ def test_one_dimensional(self):
"two dimensions are not supported"):
Emperor(self.ord_res, self.mf, remote=False)

def test_initial_unbalanced_more_than_five(self):
mf = self.mf.copy()
mf.drop(['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.607', 'PC.636'],
inplace=True)
with self.assertRaisesRegexp(KeyError, "There are samples not "
"included in the sample mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Showing only the first 5 samples out of "
"6: PC.354, PC.355, PC.356, PC.481, "
"PC.607 ..."):
Emperor(self.ord_res, mf, remote=self.url)

def test_initial_unbalanced(self):
self.mf.drop(['PC.354'], inplace=True)
mf = self.mf.copy()
mf.drop(['PC.354'], inplace=True)
with self.assertRaisesRegexp(KeyError, "There are samples not "
"included in the mapping file. Override "
"this error by using the "
"included in the sample mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Offending samples: PC.354"):
Emperor(self.ord_res, self.mf, remote=self.url)
Emperor(self.ord_res, mf, remote=self.url)

# test feature metadata
fmf = self.feature_mf.copy()
fmf.drop(['f.PC.636'], inplace=True)
with self.assertRaisesRegexp(KeyError, "There are features not "
"included in the feature mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Offending features: f.PC.636"):
Emperor(self.biplot, self.mf, fmf, remote=self.url)

def test_initial_unbalanced_ignore(self):
expected = self.mf.copy()
self.mf.drop(['PC.634'], inplace=True)

mf = self.mf.copy()
mf.drop(['PC.634'], inplace=True)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
emp = Emperor(self.ord_res, self.mf, remote=self.url,
emp = Emperor(self.ord_res, mf, remote=self.url,
ignore_missing_samples=True)

self.assertTrue(len(w) == 1)
Expand All @@ -265,21 +291,54 @@ def test_initial_unbalanced_ignore(self):
"being included with a placeholder value.",
str(w[-1].message))

expected.loc['PC.634'] = ['This sample has no metadata'] * 3
expected.loc['PC.634'] = ['This element has no metadata'] * 3

pd.util.testing.assert_frame_equal(expected.sort_index(),
emp.mf.sort_index(),
check_names=False)

expected = self.feature_mf.copy()

fmf = self.feature_mf.copy()
fmf.drop(['f.PC.636'], inplace=True)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
emp = Emperor(self.biplot, self.mf, fmf, remote=self.url,
ignore_missing_samples=True)

self.assertTrue(len(w) == 1)
self.assertTrue(issubclass(w[-1].category, EmperorWarning))
self.assertEqual("1 out of 5 features have no metadata and are "
"being included with a placeholder value.",
str(w[-1].message))

expected.loc['f.PC.636'] = ['This element has no metadata'] * 2

pd.util.testing.assert_frame_equal(expected.sort_index(),
emp.feature_mf.sort_index(),
check_names=False)

def test_no_overlap(self):
self.mf.index = self.mf.index + '.not'
mf = self.mf.copy()
mf.index = mf.index + '.not'

with self.assertRaisesRegexp(ValueError, 'None of the sample '
'identifiers match between the metadata '
'and the coordinates. Verify that you are'
' using metadata and coordinates '
'corresponding to the same dataset.'):
Emperor(self.ord_res, self.mf, remote=self.url)
Emperor(self.ord_res, mf, remote=self.url)

fmf = self.feature_mf.copy()
fmf.index = fmf.index + '.not'

with self.assertRaisesRegexp(ValueError, 'None of the feature '
'identifiers match between the metadata '
'and the coordinates. Verify that you are'
' using metadata and coordinates '
'corresponding to the same dataset.'):
Emperor(self.biplot, self.mf, fmf, remote=self.url)

def test_get_template(self):
emp = Emperor(self.ord_res, self.mf, remote=False)
Expand Down Expand Up @@ -504,11 +563,11 @@ def test_process_data_biplots_no_metadata(self):
['f.PC.636', 'f.PC.635', 'f.PC.356', 'f.PC.481',
'f.PC.354'])
self.assertEqual(bi_headers, ['id', 'all'])
self.assertEqual(bi_metadata, [['f.PC.636', 'All objects'],
['f.PC.635', 'All objects'],
['f.PC.356', 'All objects'],
['f.PC.481', 'All objects'],
['f.PC.354', 'All objects']])
self.assertEqual(bi_metadata, [['f.PC.636', 'All elements'],
['f.PC.635', 'All elements'],
['f.PC.356', 'All elements'],
['f.PC.481', 'All elements'],
['f.PC.354', 'All elements']])

def test_process_data_custom_axes(self):
emp = Emperor(self.ord_res, self.mf, remote=False)
Expand Down

0 comments on commit 8a530bc

Please sign in to comment.