ENH: Use DF.align instead of matchdf: close #172!
Turns out this is a ton more efficient. Added bonus of now relying
on pandas' implementation of this instead of ours.

Turns out transposing huge DataFrames is a pretty significant
endeavor, so calling .T on the feature table for something like the EMP
dataset was taking a super long time. Fortunately, we can finesse our
way around this by instead transposing the sample metadata and then
aligning on the columns.
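
To make the idea concrete, here's a rough, self-contained sketch of the trick
(tiny toy data and illustrative names, not the actual qurro variables):

import pandas as pd

# table: features (rows) x samples (columns); in practice this can be huge
table = pd.DataFrame(
    {"S1": [1, 2], "S2": [3, 4], "S3": [5, 6]}, index=["F1", "F2"]
)
# sample_metadata: samples (rows) x metadata fields (columns); much smaller
sample_metadata = pd.DataFrame(
    {"BodySite": ["gut", "skin"]}, index=["S1", "S3"]
)

# Rather than paying for table.T, transpose the small metadata and align
# the two DataFrames on the table's columns (i.e. the sample IDs).
m_table, m_md_t = table.align(sample_metadata.T, axis="columns", join="inner")
m_metadata = m_md_t.T  # back to samples-as-rows
print(m_table.columns.tolist())  # ['S1', 'S3'] -- only shared samples survive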

I'm glad that we reached a solution for this that preserved
all of the matching-up-front niceness re: testing. Solid stuff.

Uh, next up are #171 and then #58? But we can def merge this back
into master now.
fedarko committed Jul 3, 2019
1 parent f1438cd commit b8c90ba
Showing 2 changed files with 28 additions and 95 deletions.
53 changes: 28 additions & 25 deletions qurro/_df_utils.py
@@ -11,16 +11,6 @@
import pandas as pd


def matchdf(df1, df2):
"""Filters both DataFrames to just the rows of their shared indices.
Derived from gneiss.util.match() (https://github.com/biocore/gneiss).
"""

idx = set(df1.index) & set(df2.index)
return df1.loc[idx], df2.loc[idx]


def ensure_df_headers_unique(df, df_name):
"""Raises an error if the index or columns of the DataFrame aren't unique.
@@ -193,6 +183,14 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
should correspond to observations (i.e. features), and the columns
should correspond to samples.
Note that the input BIOM table might contain features or samples
that are not included in feature_ranks or sample_metadata,
respectively -- this is totally fine. The opposite, though, is
where things get to be a problem: if any of the features in
feature_ranks are not present in the table, or if none of the
samples in sample_metadata are in the table, then this will
raise errors.
feature_ranks: pd.DataFrame
A DataFrame describing features' "ranks" along ranking(s). The
index of this DataFrame should correspond to feature IDs, and the
@@ -224,14 +222,10 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
If none of the samples described in sample_metadata are present
in the table, this will raise a ValueError.
"""
logging.debug("Starting matching table with feature/sample data.")
# Match features to BIOM table, and then match samples to BIOM table.
# This should bring us to a point where every feature/sample is
# supported in the BIOM table. (Note that the input BIOM table might
# contain features or samples that are not included in feature_ranks or
# sample_metadata, respectively -- this is totally fine. The opposite,
# though, is a big no-no.)
featurefiltered_table, m_feature_ranks = matchdf(table, feature_ranks)
logging.debug("Starting matching table with feature rankings.")
featurefiltered_table, m_feature_ranks = table.align(
feature_ranks, axis="index", join="inner"
)
logging.debug("Matching table with feature ranks done.")
# Ensure that every ranked feature was present in the BIOM table. Raise an
# error if this isn't the case.
@@ -250,11 +244,23 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
)
)

logging.debug("Starting matching table with sample metadata.")
m_table_transpose, m_sample_metadata = matchdf(
featurefiltered_table.T, sample_metadata
# We transpose the sample metadata instead of the actual table because
# transposing in pandas, at least from some personal testing, can be really
# expensive for huge (EMP-scale) DataFrames. Since sample metadata will
# generally be smaller than the actual table, we transpose that.
logging.debug(
"Temporarily transposing sample metadata to make matching easier."
)
sample_metadata_transposed = sample_metadata.T
logging.debug("Transposing done.")
logging.debug("Starting matching table with (tranposed) sample metadata.")
m_table, m_sample_metadata_transposed = featurefiltered_table.align(
sample_metadata_transposed, axis="columns", join="inner"
)
logging.debug("Matching table with sample metadata done.")
logging.debug("Transposing sample metadata again to reset it.")
m_sample_metadata = m_sample_metadata_transposed.T
logging.debug("Transposing done.")
# Allow for dropped samples (e.g. negative controls), but ensure that at
# least one sample is supported by the BIOM table.
if m_sample_metadata.shape[0] < 1:
@@ -270,10 +276,7 @@ def match_table_and_data(table, feature_ranks, sample_metadata):
"present in the BIOM table, and have been removed from the "
"visualization.".format(dropped_sample_ct)
)
# We return the transpose of the transposed table, so the table should have
# the same "orientation" (i.e. columns are samples, rows (indices) are
# features) as the input table.
return m_table_transpose.T, m_sample_metadata
return m_table, m_sample_metadata


def merge_feature_metadata(feature_ranks, feature_metadata=None):
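
For reference, here's a minimal standalone sketch (toy data, not part of this
diff) of the align() call that replaces matchdf() for the feature-ranks
matching, plus an illustrative check in the spirit of the missing-feature
error above:

import pandas as pd

table = pd.DataFrame({"S1": [1, 0], "S2": [2, 3]}, index=["F1", "F2"])
feature_ranks = pd.DataFrame({"Rank0": [0.5, -0.5]}, index=["F1", "F3"])

# Inner-join alignment on the row labels keeps only the shared features.
filtered_table, m_ranks = table.align(feature_ranks, axis="index", join="inner")

# "F3" is ranked but absent from the table, so this raises a ValueError.
if m_ranks.shape[0] < feature_ranks.shape[0]:
    missing = set(feature_ranks.index) - set(m_ranks.index)
    raise ValueError("Ranked feature(s) not in the table: {}".format(missing))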
70 changes: 0 additions & 70 deletions qurro/tests/test_df_utils.py
@@ -3,83 +3,13 @@
from pandas.testing import assert_frame_equal, assert_series_equal
import numpy as np
from qurro._df_utils import (
matchdf,
ensure_df_headers_unique,
validate_df,
replace_nan,
remove_empty_samples,
)


def test_matchdf():
"""Tests the matchdf() function."""

df1 = DataFrame(
{
"col1": [1, 2, 3, 4, 5],
"col2": [6, 7, 8, 9, 10],
"col3": [11, 12, 13, 14, 15],
},
index=["a", "b", "c", "d", "e"],
)
df2 = DataFrame(
{
"colA": [5, 4, 3, 2, 1],
"colB": [10, 9, 8, 7, 6],
"colC": [15, 14, 13, 12, 11],
"colD": ["q", "w", "e", "r", "t"],
},
index=["a", "c", "d", "x", "y"],
)
df3 = DataFrame(index=["a", "x"])
df4 = DataFrame(index=["x"])

# The ground truth DF from matching dfX with dfY is named dfXY
df12 = DataFrame(
{"col1": [1, 3, 4], "col2": [6, 8, 9], "col3": [11, 13, 14]},
index=["a", "c", "d"],
)
df21 = DataFrame(
{
"colA": [5, 4, 3],
"colB": [10, 9, 8],
"colC": [15, 14, 13],
"colD": ["q", "w", "e"],
},
index=["a", "c", "d"],
)
df13 = DataFrame({"col1": [1], "col2": [6], "col3": [11]}, index=["a"])
df31 = DataFrame(index=["a"])
# we need to specify a dtype of "int64" here because pandas, by default,
# infers that df14's dtype is just "object"; however, the result of
# matching df1 and df4 will have an "int64" dtype (since df1 already has
# an inferred "int64" dtype).
df14 = DataFrame(columns=["col1", "col2", "col3"]).astype("int64")
df41 = DataFrame()

# Basic testing: ensure that matching results match up with the ground
# truths
A, B = matchdf(df1, df2)
assert_frame_equal(A, df12, check_like=True)
assert_frame_equal(B, df21, check_like=True)

# Test "commutativity" of matchdf() -- reversing the DFs' orders shouldn't
# change the matching results (aside from the output order, of course)
A, B = matchdf(df2, df1)
assert_frame_equal(A, df21, check_like=True)
assert_frame_equal(B, df12, check_like=True)

# Test that matching with empty DFs works as expected
# First, try matching in the case where at least one index name matches
A, B = matchdf(df1, df3)
assert_frame_equal(A, df13, check_like=True)
assert_frame_equal(B, df31, check_like=True)
# Next, try matching in the case where there's no overlap in index names
A, B = matchdf(df1, df4)
assert_frame_equal(A, df14, check_like=True)
assert_frame_equal(B, df41, check_like=True)


def test_ensure_df_headers_unique():
"""Tests the ensure_df_headers_unique() function in generate.py."""

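
(Not part of this commit, but as a quick illustration of what the deleted
test_matchdf() covered: the same row-intersection behavior can be checked
directly against DataFrame.align, using fixtures in the style of the removed
test.)

from pandas import DataFrame
from pandas.testing import assert_frame_equal

df1 = DataFrame({"col1": [1, 2, 3, 4, 5]}, index=["a", "b", "c", "d", "e"])
df2 = DataFrame({"colA": [5, 4, 3, 2, 1]}, index=["a", "c", "d", "x", "y"])

# Inner-join alignment keeps only the shared row labels, like matchdf() did.
A, B = df1.align(df2, axis="index", join="inner")
assert_frame_equal(
    A, DataFrame({"col1": [1, 3, 4]}, index=["a", "c", "d"]), check_like=True
)
assert_frame_equal(
    B, DataFrame({"colA": [5, 4, 3]}, index=["a", "c", "d"]), check_like=True
)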
