Skip to content

Commit

Permalink
ENH: Speed up empty sample/feature removing
Browse files Browse the repository at this point in the history
Relates to #58, #171.

Now it's pretty quick to run the EMP with -x 2000 into Qurro. Going
to do some more basic benchmarking with this.
  • Loading branch information
fedarko committed Jul 7, 2019
1 parent c175cc8 commit 1942df0
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 50 deletions.
12 changes: 9 additions & 3 deletions docs/demos/byrd/js/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
var dataName = samplePlotSpec.data.name;
var sid;
for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"];
if (sid !== undefined) {
sampleIDs.push(sid);
}
Expand Down Expand Up @@ -1059,8 +1059,14 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
updateBalanceSingle(sampleRow) {
var sampleID = sampleRow["Sample ID"];
this.validateSampleID(sampleID);
var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
var topCt = this.getCount(
this.newFeatureHigh["Feature ID"],
sampleID
);
var botCt = this.getCount(
this.newFeatureLow["Feature ID"],
sampleID
);
return feature_computation.computeBalance(topCt, botCt);
}

Expand Down
12 changes: 9 additions & 3 deletions docs/demos/q2_moving_pictures/js/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
var dataName = samplePlotSpec.data.name;
var sid;
for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"];
if (sid !== undefined) {
sampleIDs.push(sid);
}
Expand Down Expand Up @@ -1059,8 +1059,14 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
updateBalanceSingle(sampleRow) {
var sampleID = sampleRow["Sample ID"];
this.validateSampleID(sampleID);
var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
var topCt = this.getCount(
this.newFeatureHigh["Feature ID"],
sampleID
);
var botCt = this.getCount(
this.newFeatureLow["Feature ID"],
sampleID
);
return feature_computation.computeBalance(topCt, botCt);
}

Expand Down
2 changes: 1 addition & 1 deletion docs/demos/q2_moving_pictures/main.js

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions docs/demos/red_sea/js/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
var dataName = samplePlotSpec.data.name;
var sid;
for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"];
if (sid !== undefined) {
sampleIDs.push(sid);
}
Expand Down Expand Up @@ -1059,8 +1059,14 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
updateBalanceSingle(sampleRow) {
var sampleID = sampleRow["Sample ID"];
this.validateSampleID(sampleID);
var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
var topCt = this.getCount(
this.newFeatureHigh["Feature ID"],
sampleID
);
var botCt = this.getCount(
this.newFeatureLow["Feature ID"],
sampleID
);
return feature_computation.computeBalance(topCt, botCt);
}

Expand Down
12 changes: 9 additions & 3 deletions docs/demos/sleep_apnea/js/display.js
Original file line number Diff line number Diff line change
Expand Up @@ -970,7 +970,7 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
var dataName = samplePlotSpec.data.name;
var sid;
for (var s = 0; s < samplePlotSpec.datasets[dataName].length; s++) {
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"]
sid = samplePlotSpec.datasets[dataName][s]["Sample ID"];
if (sid !== undefined) {
sampleIDs.push(sid);
}
Expand Down Expand Up @@ -1059,8 +1059,14 @@ define(["./feature_computation", "./dom_utils", "vega", "vega-embed"], function(
updateBalanceSingle(sampleRow) {
var sampleID = sampleRow["Sample ID"];
this.validateSampleID(sampleID);
var topCt = this.getCount(this.newFeatureHigh["Feature ID"], sampleID);
var botCt = this.getCount(this.newFeatureLow["Feature ID"], sampleID);
var topCt = this.getCount(
this.newFeatureHigh["Feature ID"],
sampleID
);
var botCt = this.getCount(
this.newFeatureLow["Feature ID"],
sampleID
);
return feature_computation.computeBalance(topCt, botCt);
}

Expand Down
2 changes: 1 addition & 1 deletion docs/demos/sleep_apnea/main.js

Large diffs are not rendered by default.

62 changes: 26 additions & 36 deletions qurro/_df_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,54 +152,44 @@ def remove_empty_samples_and_features(
"""
logging.debug("Attempting to remove empty samples and features.")

# Filter out empty samples (columns) and features (rows) from the table
# This approach based on https://stackoverflow.com/a/21165116/10730311.
neq_0 = table_sdf != 0
filtered_table = table_sdf.loc[
neq_0.any(axis="columns"), neq_0.any(axis="index")
]

# If the table only contains zeros, then attempting to drop all empty
# samples and/or features would result in a 0x0 DataFrame. Therefore, we
# just raise a ValueError in this case.
if (table_sdf == 0).all().all():
# samples and/or features results in a 0x0 DataFrame. We raise a ValueError
# in this case.
if filtered_table.empty:
raise ValueError("The table is empty.")

# Filter out empty samples
# Basically, we convert each cell in the table to a bool (True if !=
# 0, False if == 0). Then we just find all the columns (samples) with at
# least one True value, and filter the table to just those columns.
neq_zero = table_sdf != 0
nonempty_samples = []
for sample in table_sdf.columns:
if neq_zero[sample].any():
nonempty_samples.append(sample)

samplefiltered_table = table_sdf.filter(
items=nonempty_samples, axis="columns"
)
filtered_metadata = sample_metadata_df.filter(
items=nonempty_samples, axis="index"
)

# Filter out empty features
# Same method as above, but operating on rows (features) instead of on
# columns (samples).
neq_zero = samplefiltered_table != 0
nonempty_features = []
for feature in samplefiltered_table.index:
if neq_zero.loc[feature].any():
nonempty_features.append(feature)

filtered_table = samplefiltered_table.filter(
items=nonempty_features, axis="index"
)
filtered_ranks = feature_ranks_df.filter(
items=nonempty_features, axis="index"
)

# Let user know about which samples/features may have been dropped, if any.
# And, if we filtered out any samples or features, filter the sample
# metadata and feature ranks (respectively) to match (this is just done by
# aligning them to the filtered table).
filtered_metadata = sample_metadata_df
filtered_ranks = feature_ranks_df

sample_diff = len(table_sdf.columns) - len(filtered_table.columns)
if sample_diff > 0:
# As with match_table_and_data, we have to transpose the sample
# metadata in order to align it with the table (since samples are
# stored as columns in the table but as indices in the sample metadata)
sm_t = sample_metadata_df.T
filtered_metadata = filtered_table.align(
sm_t, join="inner", axis="columns"
)[1].T
logging.debug("Removed {} empty sample(s).".format(sample_diff))
else:
logging.debug("Couldn't find any empty samples.")

feature_diff = len(table_sdf.index) - len(filtered_table.index)
if feature_diff > 0:
filtered_ranks = filtered_table.align(
feature_ranks_df, join="inner", axis="index"
)[1]
logging.debug("Removed {} empty feature(s).".format(feature_diff))
else:
logging.debug("Couldn't find any empty features.")
Expand Down

0 comments on commit 1942df0

Please sign in to comment.