ENH: Use DF.align instead of matchdf: close #172!
Turns out this is a ton more efficient. Added bonus of now relying
on pandas' implementation of this instead of ours.

Turns out transposing huge DataFrames is a pretty significant
endeavor, so calling .T on the feature table for something like the EMP
dataset was taking a super long time. Fortunately, we can finesse our
way around this by instead transposing the sample metadata and then
aligning on the columns.
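
To make the idea concrete, here's a rough, self-contained sketch of the trick
(tiny toy data and illustrative names, not the actual qurro variables):

import pandas as pd

# table: features (rows) x samples (columns); in practice this can be huge
table = pd.DataFrame(
    {"S1": [1, 2], "S2": [3, 4], "S3": [5, 6]}, index=["F1", "F2"]
)
# sample_metadata: samples (rows) x metadata fields (columns); much smaller
sample_metadata = pd.DataFrame(
    {"BodySite": ["gut", "skin"]}, index=["S1", "S3"]
)

# Rather than paying for table.T, transpose the small metadata and align
# the two DataFrames on the table's columns (i.e. the sample IDs).
m_table, m_md_t = table.align(sample_metadata.T, axis="columns", join="inner")
m_metadata = m_md_t.T  # back to samples-as-rows
print(m_table.columns.tolist())  # ['S1', 'S3'] -- only shared samples survive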

I'm glad that we reached a solution for this that preserved
all of the matching-up-front niceness re: testing. Solid stuff.

Uh, next up are #171 and then #58? But we can def merge this back
into master now.
fedarko committed Jul 3, 2019
1 parent f1438cd commit b8c90ba
Showing 2 changed files with 28 additions and 95 deletions.
53 changes: 28 additions & 25 deletions qurro/_df_utils.py
@@ -11,16 +11,6 @@
import pandas as pd


def matchdf(df1, df2):
"""Filters both DataFrames to just the rows of their shared indices.
Derived from gneiss.util.match() (https://github.com/biocore/gneiss).
"""

idx = set(df1.index) & set(df2.index)
return df1.loc[idx], df2.loc[idx]


def ensure_df_headers_unique(df, df_name):
"""Raises an error if the index or columns of the DataFrame aren't unique.
@@ -193,6 +183,14 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
should correspond to observations (i.e. features), and the columns
should correspond to samples.
Note that the input BIOM table might contain features or samples
that are not included in feature_ranks or sample_metadata,
respectively -- this is totally fine. The opposite, though, is
where things get to be a problem: if any of the features in
feature_ranks are not present in the table, or if none of the
samples in sample_metadata are in the table, then this will
raise errors.
feature_ranks: pd.DataFrame
A DataFrame describing features' "ranks" along ranking(s). The
index of this DataFrame should correspond to feature IDs, and the
@@ -224,14 +222,10 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
If none of the samples described in sample_metadata are present
in the table, this will raise a ValueError.
"""
logging.debug("Starting matching table with feature/sample data.")
# Match features to BIOM table, and then match samples to BIOM table.
# This should bring us to a point where every feature/sample is
# supported in the BIOM table. (Note that the input BIOM table might
# contain features or samples that are not included in feature_ranks or
# sample_metadata, respectively -- this is totally fine. The opposite,
# though, is a big no-no.)
featurefiltered_table, m_feature_ranks = matchdf(table, feature_ranks)
logging.debug("Starting matching table with feature rankings.")
featurefiltered_table, m_feature_ranks = table.align(
feature_ranks, axis="index", join="inner"
)
logging.debug("Matching table with feature ranks done.")
# Ensure that every ranked feature was present in the BIOM table. Raise an
# error if this isn't the case.
@@ -250,11 +244,23 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
)
)

logging.debug("Starting matching table with sample metadata.")
m_table_transpose, m_sample_metadata = matchdf(
featurefiltered_table.T, sample_metadata
# We transpose the sample metadata instead of the actual table because
# transposing in pandas, at least from some personal testing, can be really
# expensive for huge (EMP-scale) DataFrames. Since sample metadata will
# generally be smaller than the actual table, we transpose that.
logging.debug(
"Temporarily transposing sample metadata to make matching easier."
)
sample_metadata_transposed = sample_metadata.T
logging.debug("Transposing done.")
logging.debug("Starting matching table with (tranposed) sample metadata.")
m_table, m_sample_metadata_transposed = featurefiltered_table.align(
sample_metadata_transposed, axis="columns", join="inner"
)
logging.debug("Matching table with sample metadata done.")
logging.debug("Transposing sample metadata again to reset it.")
m_sample_metadata = m_sample_metadata_transposed.T
logging.debug("Transposing done.")
# Allow for dropped samples (e.g. negative controls), but ensure that at
# least one sample is supported by the BIOM table.
if m_sample_metadata.shape[0] < 1:
@@ -270,10 +276,7 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
"present in the BIOM table, and have been removed from the "
"visualization.".format(dropped_sample_ct)
)
# We return the transpose of the transposed table, so the table should have
# the same "orientation" (i.e. columns are samples, rows (indices) are
# features) as the input table.
return m_table_transpose.T, m_sample_metadata
return m_table, m_sample_metadata


def merge_feature_metadata(feature_ranks, feature_metadata=None):
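
For reference, here's a minimal standalone sketch (toy data, not part of this
diff) of the align() call that replaces matchdf() for the feature-ranks
matching, plus an illustrative check in the spirit of the missing-feature
error above:

import pandas as pd

table = pd.DataFrame({"S1": [1, 0], "S2": [2, 3]}, index=["F1", "F2"])
feature_ranks = pd.DataFrame({"Rank0": [0.5, -0.5]}, index=["F1", "F3"])

# Inner-join alignment on the row labels keeps only the shared features.
filtered_table, m_ranks = table.align(feature_ranks, axis="index", join="inner")

# "F3" is ranked but absent from the table, so this raises a ValueError.
if m_ranks.shape[0] < feature_ranks.shape[0]:
    missing = set(feature_ranks.index) - set(m_ranks.index)
    raise ValueError("Ranked feature(s) not in the table: {}".format(missing))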
70 changes: 0 additions & 70 deletions qurro/tests/test_df_utils.py
@@ -3,83 +3,13 @@
from pandas.testing import assert_frame_equal, assert_series_equal
import numpy as np
from qurro._df_utils import (
matchdf,
ensure_df_headers_unique,
validate_df,
replace_nan,
remove_empty_samples,
)


def test_matchdf():
"""Tests the matchdf() function."""

df1 = DataFrame(
{
"col1": [1, 2, 3, 4, 5],
"col2": [6, 7, 8, 9, 10],
"col3": [11, 12, 13, 14, 15],
},
index=["a", "b", "c", "d", "e"],
)
df2 = DataFrame(
{
"colA": [5, 4, 3, 2, 1],
"colB": [10, 9, 8, 7, 6],
"colC": [15, 14, 13, 12, 11],
"colD": ["q", "w", "e", "r", "t"],
},
index=["a", "c", "d", "x", "y"],
)
df3 = DataFrame(index=["a", "x"])
df4 = DataFrame(index=["x"])

# The ground truth DF from matching dfX with dfY is named dfXY
df12 = DataFrame(
{"col1": [1, 3, 4], "col2": [6, 8, 9], "col3": [11, 13, 14]},
index=["a", "c", "d"],
)
df21 = DataFrame(
{
"colA": [5, 4, 3],
"colB": [10, 9, 8],
"colC": [15, 14, 13],
"colD": ["q", "w", "e"],
},
index=["a", "c", "d"],
)
df13 = DataFrame({"col1": [1], "col2": [6], "col3": [11]}, index=["a"])
df31 = DataFrame(index=["a"])
# we need to specify a dtype of "int64" here because pandas, by default,
# infers that df14's dtype is just "object"; however, the result of
# matching df1 and df4 will have an "int64" dtype (since df1 already has
# an inferred "int64" dtype).
df14 = DataFrame(columns=["col1", "col2", "col3"]).astype("int64")
df41 = DataFrame()

# Basic testing: ensure that matching results match up with the ground
# truths
A, B = matchdf(df1, df2)
assert_frame_equal(A, df12, check_like=True)
assert_frame_equal(B, df21, check_like=True)

# Test "commutativity" of matchdf() -- reversing the DFs' orders shouldn't
# change the matching results (aside from the output order, of course)
A, B = matchdf(df2, df1)
assert_frame_equal(A, df21, check_like=True)
assert_frame_equal(B, df12, check_like=True)

# Test that matching with empty DFs works as expected
# First, try matching in the case where at least one index name matches
A, B = matchdf(df1, df3)
assert_frame_equal(A, df13, check_like=True)
assert_frame_equal(B, df31, check_like=True)
# Next, try matching in the case where there's no overlap in index names
A, B = matchdf(df1, df4)
assert_frame_equal(A, df14, check_like=True)
assert_frame_equal(B, df41, check_like=True)


def test_ensure_df_headers_unique():
"""Tests the ensure_df_headers_unique() function in generate.py."""

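
(Not part of this commit, but as a quick illustration of what the deleted
test_matchdf() covered: the same row-intersection behavior can be checked
directly against DataFrame.align, using fixtures in the style of the removed
test.)

from pandas import DataFrame
from pandas.testing import assert_frame_equal

df1 = DataFrame({"col1": [1, 2, 3, 4, 5]}, index=["a", "b", "c", "d", "e"])
df2 = DataFrame({"colA": [5, 4, 3, 2, 1]}, index=["a", "c", "d", "x", "y"])

# Inner-join alignment keeps only the shared row labels, like matchdf() did.
A, B = df1.align(df2, axis="index", join="inner")
assert_frame_equal(
    A, DataFrame({"col1": [1, 3, 4]}, index=["a", "c", "d"]), check_like=True
)
assert_frame_equal(
    B, DataFrame({"colA": [5, 4, 3]}, index=["a", "c", "d"]), check_like=True
)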
