Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Allow for missing features in feature metadata #734

Merged
merged 3 commits into from
Oct 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 38 additions & 23 deletions emperor/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,11 @@ class Emperor(object):
A list of the OrdinationResults objects with the same sample
identifiers as the identifiers in ``ordination``.
ignore_missing_samples: bool, optional
If set to `True` samples without metadata are included by setting all
metadata values to: ``This sample has not metadata``. By default an
exception will be raised if missing samples are encountered. Note, this
flag only takes effect if there's at least one overlapping sample.
If set to `True` samples and features without metadata are included by
setting all metadata values to: ``This element has no metadata``. By
default an exception will be raised if missing elements are
encountered. Note, this flag only takes effect if there's at least one
overlapping element.

Attributes
----------
Expand Down Expand Up @@ -250,7 +251,7 @@ def __init__(self, ordination, mapping_file, feature_mapping_file=None,
self.feature_mf = \
self._validate_metadata(feature_mapping_file,
self.ordination.features,
ignore_missing_samples=False)
ignore_missing_samples, kind='feature')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is awesome!


self._validate_ordinations()

Expand Down Expand Up @@ -302,40 +303,54 @@ def _repr_html_(self):

return display(HTML(str(self)))

def _validate_metadata(self, metadata, matrix, ignore_missing_samples):
def _validate_metadata(self, metadata, matrix, ignore_missing_samples,
kind='sample'):

if kind not in {'sample', 'feature'}:
raise ValueError('Unsupported "kind" value %s' % kind)

# metadata is optional for biplots, so we just create an empty table
if metadata is None:
metadata = pd.DataFrame(index=pd.Index(matrix.index, name='id'))
metadata['all'] = 'All objects'
metadata['all'] = 'All elements'
return metadata

ordination_samples = set(matrix.index)
difference = ordination_samples - set(metadata.index)
ordination_elements = set(matrix.index)
difference = ordination_elements - set(metadata.index)

if difference == ordination_samples:
raise ValueError('None of the sample identifiers match between the'
if difference == ordination_elements:
raise ValueError('None of the %s identifiers match between the'
' metadata and the coordinates. Verify that you '
'are using metadata and coordinates corresponding'
' to the same dataset.')
' to the same dataset.' % kind)

if difference and not ignore_missing_samples:
raise KeyError("There are samples not included in the mapping "
# sort the elements so we have a deterministic output
difference = sorted([str(i) for i in difference])

# if there are more than 5 missing elements, truncate the list
if len(difference) > 5:
elements = ', '.join(difference[:5])
suffix = ("Showing only the first 5 %ss out of %d: %s ..." %
(kind, len(difference), elements))
else:
elements = ', '.join(difference)
suffix = ("Offending %ss: %s" % (kind, elements))

raise KeyError("There are %ss not included in the %s mapping "
"file. Override this error by using the "
"`ignore_missing_samples` argument. Offending "
"samples: %s"
% ', '.join(sorted([str(i) for i in difference])))
"`ignore_missing_samples` argument. %s" %
(kind, kind, suffix))
elif difference and ignore_missing_samples:
warnings.warn("%d out of %d samples have no metadata and are being"
warnings.warn("%d out of %d %ss have no metadata and are being"
" included with a placeholder value." %
(len(difference), len(ordination_samples)),
(len(difference), len(ordination_elements), kind),
EmperorWarning)

# pad the missing samples
data = np.full((len(difference), metadata.shape[1]),
'This sample has no metadata', dtype='<U27')
pad = pd.DataFrame(index=difference, columns=self.mf.columns,
data=data)
# pad the missing elements
pad = pd.DataFrame(index=difference, columns=metadata.columns,
dtype=str)
pad.fillna('This element has no metadata', inplace=True)
metadata = pd.concat([metadata, pad])

# filter all metadata that we may have for which we don't have any
Expand Down
89 changes: 74 additions & 15 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_initial_biplots_no_metadata(self):

empty_mf = pd.DataFrame(index=['f.PC.636', 'f.PC.635', 'f.PC.356',
'f.PC.481', 'f.PC.354'])
empty_mf['all'] = 'All objects'
empty_mf['all'] = 'All elements'

pd.util.testing.assert_frame_equal(empty_mf, emp.feature_mf,
check_names=False)
Expand All @@ -241,22 +241,48 @@ def test_one_dimensional(self):
"two dimensions are not supported"):
Emperor(self.ord_res, self.mf, remote=False)

def test_initial_unbalanced_more_than_five(self):
mf = self.mf.copy()
mf.drop(['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.607', 'PC.636'],
inplace=True)
with self.assertRaisesRegexp(KeyError, "There are samples not "
"included in the sample mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Showing only the first 5 samples out of "
"6: PC.354, PC.355, PC.356, PC.481, "
"PC.607 ..."):
Emperor(self.ord_res, mf, remote=self.url)

def test_initial_unbalanced(self):
self.mf.drop(['PC.354'], inplace=True)
mf = self.mf.copy()
mf.drop(['PC.354'], inplace=True)
with self.assertRaisesRegexp(KeyError, "There are samples not "
"included in the mapping file. Override "
"this error by using the "
"included in the sample mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Offending samples: PC.354"):
Emperor(self.ord_res, self.mf, remote=self.url)
Emperor(self.ord_res, mf, remote=self.url)

# test feature metadata
fmf = self.feature_mf.copy()
fmf.drop(['f.PC.636'], inplace=True)
with self.assertRaisesRegexp(KeyError, "There are features not "
"included in the feature mapping file. "
"Override this error by using the "
"`ignore_missing_samples` argument. "
"Offending features: f.PC.636"):
Emperor(self.biplot, self.mf, fmf, remote=self.url)

def test_initial_unbalanced_ignore(self):
expected = self.mf.copy()
self.mf.drop(['PC.634'], inplace=True)

mf = self.mf.copy()
mf.drop(['PC.634'], inplace=True)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
emp = Emperor(self.ord_res, self.mf, remote=self.url,
emp = Emperor(self.ord_res, mf, remote=self.url,
ignore_missing_samples=True)

self.assertTrue(len(w) == 1)
Expand All @@ -265,21 +291,54 @@ def test_initial_unbalanced_ignore(self):
"being included with a placeholder value.",
str(w[-1].message))

expected.loc['PC.634'] = ['This sample has no metadata'] * 3
expected.loc['PC.634'] = ['This element has no metadata'] * 3

pd.util.testing.assert_frame_equal(expected.sort_index(),
emp.mf.sort_index(),
check_names=False)

expected = self.feature_mf.copy()

fmf = self.feature_mf.copy()
fmf.drop(['f.PC.636'], inplace=True)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
emp = Emperor(self.biplot, self.mf, fmf, remote=self.url,
ignore_missing_samples=True)

self.assertTrue(len(w) == 1)
self.assertTrue(issubclass(w[-1].category, EmperorWarning))
self.assertEqual("1 out of 5 features have no metadata and are "
"being included with a placeholder value.",
str(w[-1].message))

expected.loc['f.PC.636'] = ['This element has no metadata'] * 2

pd.util.testing.assert_frame_equal(expected.sort_index(),
emp.feature_mf.sort_index(),
check_names=False)

def test_no_overlap(self):
self.mf.index = self.mf.index + '.not'
mf = self.mf.copy()
mf.index = mf.index + '.not'

with self.assertRaisesRegexp(ValueError, 'None of the sample '
'identifiers match between the metadata '
'and the coordinates. Verify that you are'
' using metadata and coordinates '
'corresponding to the same dataset.'):
Emperor(self.ord_res, self.mf, remote=self.url)
Emperor(self.ord_res, mf, remote=self.url)

fmf = self.feature_mf.copy()
fmf.index = fmf.index + '.not'

with self.assertRaisesRegexp(ValueError, 'None of the feature '
'identifiers match between the metadata '
'and the coordinates. Verify that you are'
' using metadata and coordinates '
'corresponding to the same dataset.'):
Emperor(self.biplot, self.mf, fmf, remote=self.url)

def test_get_template(self):
emp = Emperor(self.ord_res, self.mf, remote=False)
Expand Down Expand Up @@ -504,11 +563,11 @@ def test_process_data_biplots_no_metadata(self):
['f.PC.636', 'f.PC.635', 'f.PC.356', 'f.PC.481',
'f.PC.354'])
self.assertEqual(bi_headers, ['id', 'all'])
self.assertEqual(bi_metadata, [['f.PC.636', 'All objects'],
['f.PC.635', 'All objects'],
['f.PC.356', 'All objects'],
['f.PC.481', 'All objects'],
['f.PC.354', 'All objects']])
self.assertEqual(bi_metadata, [['f.PC.636', 'All elements'],
['f.PC.635', 'All elements'],
['f.PC.356', 'All elements'],
['f.PC.481', 'All elements'],
['f.PC.354', 'All elements']])

def test_process_data_custom_axes(self):
emp = Emperor(self.ord_res, self.mf, remote=False)
Expand Down