WaveICA: Added boolean parameter to output feature table and metadata table separately #555

Merged: 7 commits, Jun 6, 2024
Changes from 1 commit
12 changes: 11 additions & 1 deletion tools/waveica/macros.xml
@@ -96,10 +96,20 @@
</when>
</conditional>
</xml>
<xml name="split_output">
<param name = "keep_two_output" label="Keep Two Output in recetox-aplcms format" type="boolean" checked="false"
truevalue="TRUE" falsevalue="FALSE" help="keep two outputs, one is feature table and second is metadata table" />
</xml>

<xml name="outputs">
<outputs>
<data name="normalized_data" format="tsv">
<data name="normalized_data" format="tsv" label="Normalized table of ${on_string}">
<change_format>
<when input_dataset="data" attribute="ext" value="parquet" format="parquet" />
</change_format>
</data>
<data name="metadata" format="tsv" label="Metadata table of ${on_string}">
<filter>keep_two_output</filter>
<change_format>
<when input_dataset="data" attribute="ext" value="parquet" format="parquet" />
</change_format>
5 changes: 5 additions & 0 deletions tools/waveica/test-data/test10_output1.tsv
@@ -0,0 +1,5 @@
id VT_160120_002 VT_160120_004 VT_160120_006 VT_160120_008 VT_160120_010
M85T34 355200.506508035 216897.826587868 362337.195084504 143303.377379009 189065.516447239
M86T41 75115889.9077485 75204863.1495248 76490295.1450204 83771659.9549148 84108898.7658797
M86T518 6101488.54615418 6170882.26270475 12588041.969092 6181538.46316058 6103964.42378424
M86T539 2007379.02604984 2069979.64992079 1818589.63912375 1975712.25920485 1935671.32085241
6 changes: 6 additions & 0 deletions tools/waveica/test-data/test10_output2.tsv
@@ -0,0 +1,6 @@
sampleName class sampleType injectionOrder batch
VT_160120_002 sample sample 1 1
VT_160120_004 sample sample 2 1
VT_160120_006 sample sample 3 1
VT_160120_008 sample sample 4 1
VT_160120_010 sample sample 5 1
Binary file added tools/waveica/test-data/test9_output1.parquet
Binary file not shown.
Binary file added tools/waveica/test-data/test9_output2.parquet
Binary file not shown.
61 changes: 47 additions & 14 deletions tools/waveica/waveica.xml
@@ -1,4 +1,4 @@
<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy6" profile="21.09">
<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy7" profile="21.09">
<description>removal of batch effects for untargeted metabolomics data</description>
<macros>
<import>macros.xml</import>
@@ -51,14 +51,14 @@
)'
#end if

-e 'store_data(normalized_data, "$normalized_data", "$input_num.data.ext")'
-e 'store_data(normalized_data, "$normalized_data", "$metadata", "$input_num.data.ext", $keep_two_output)'
]]></command>

<inputs>
<conditional name="input_num">
<param name="input_choice" type="select" label="Choose input files:">
<option value="1" selected="true">1</option>
<option value="2">2</option>
<option value="1" selected="true">1: intensity-by-feature table with metadata</option>
<option value="2">2: intensity-by-feature table and metadata table separately</option>
</param>
<when value="1">
<expand macro="input_data"/>
@@ -89,12 +89,13 @@
</when>
</conditional>
<expand macro="exclude_blanks"/>
<expand macro="split_output"/>
</inputs>

<expand macro="outputs"/>

<tests>
<test><!-- TEST 1 -->
<test expect_num_outputs="1"><!-- TEST 1 -->
<param name="data" value="input_data.csv" ftype="csv"/>
<param name="mode" value="batchwise"/>
<param name="wavelet_filter" value="d"/>
@@ -105,7 +106,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.tsv" ftype="tsv"/>
</test>
<test><!-- TEST 2 -->
<test expect_num_outputs="1"><!-- TEST 2 -->
<param name="data" value="input_data.tsv" ftype="tsv"/>
<param name="mode" value="batchwise"/>
<param name="wavelet_filter" value="d"/>
@@ -116,7 +117,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.tsv" ftype="tsv"/>
</test>
<test><!-- TEST 3 -->
<test expect_num_outputs="1"><!-- TEST 3 -->
<param name="data" value="input_data.parquet" ftype="parquet"/>
<param name="mode" value="batchwise"/>
<param name="wavelet_filter" value="d"/>
@@ -127,7 +128,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.parquet" ftype="parquet"/>
</test>
<test><!-- TEST 4 -->
<test expect_num_outputs="1"><!-- TEST 4 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table.csv" ftype="csv"/>
<param name="metadata" value="metadata.csv" ftype="csv"/>
@@ -140,7 +141,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.tsv" ftype="tsv"/>
</test>
<test><!-- TEST 5 -->
<test expect_num_outputs="1"><!-- TEST 5 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table.tsv" ftype="tsv"/>
<param name="metadata" value="metadata.tsv" ftype="tsv"/>
@@ -153,7 +154,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.tsv" ftype="tsv"/>
</test>
<test><!-- TEST 6 -->
<test expect_num_outputs="1"><!-- TEST 6 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table.parquet" ftype="parquet"/>
<param name="metadata" value="metadata.csv" ftype="csv"/>
@@ -166,7 +167,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.parquet" compare="sim_size" delta="200" ftype="parquet"/>
</test>
<test><!-- TEST 7 -->
<test expect_num_outputs="1"><!-- TEST 7 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table_transpose_version.parquet" ftype="parquet"/>
<param name="metadata" value="metadata.parquet" ftype="parquet"/>
@@ -180,7 +181,7 @@
<param name="alpha" value="0"/>
<output name="normalized_data" file="normalized_data.parquet" compare="sim_size" delta="200" ftype="parquet"/>
</test>
<test><!-- TEST 8 -->
<test expect_num_outputs="1"><!-- TEST 8 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table_transpose_version.csv" ftype="csv"/>
<param name="metadata" value="metadata.csv" ftype="csv"/>
@@ -205,10 +206,42 @@
<param name="cutoff" value="0"/>
<output name="normalized_data" file="normalized_data_nobatch.tsv"/>
</test> -->
<test expect_failure="true">
<test expect_num_outputs="2"><!-- TEST 9 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table_transpose_version.parquet" ftype="parquet"/>
<param name="metadata" value="metadata.parquet" ftype="parquet"/>
<param name="transpose_feature_table" value="TRUE"/>
<param name="mode" value="batchwise"/>
<param name="wavelet_filter" value="d"/>
<param name="wavelet_length" value="2"/>
<param name="k" value="20"/>
<param name="t" value="0.05"/>
<param name="t2" value="0.05"/>
<param name="alpha" value="0"/>
<param name="keep_two_output" value="TRUE"/>
<output name="normalized_data" file="test9_output1.parquet" ftype="parquet"/>
<output name="metadata" file="test9_output2.parquet" ftype="parquet"/>
</test>
<test expect_num_outputs="2"><!-- TEST 10 -->
<param name="input_choice" value="2"/>
<param name="data" value="feature_table_transpose_version.csv" ftype="csv"/>
<param name="metadata" value="metadata.csv" ftype="csv"/>
<param name="transpose_feature_table" value="TRUE"/>
<param name="mode" value="batchwise"/>
<param name="wavelet_filter" value="d"/>
<param name="wavelet_length" value="2"/>
<param name="k" value="20"/>
<param name="t" value="0.05"/>
<param name="t2" value="0.05"/>
<param name="alpha" value="0"/>
<param name="keep_two_output" value="TRUE"/>
<output name="normalized_data" file="test10_output1.tsv" ftype="tsv"/>
<output name="metadata" file="test10_output2.tsv" ftype="tsv"/>
</test>
<test expect_failure="true"><!-- TEST 11 -->
<param name="data" value="na_data.csv" ftype="csv"/>
</test>
<test expect_failure="true">
<test expect_failure="true"><!-- TEST 12 -->
<param name="data" value="incomplete_metadata_data.csv" ftype="csv"/>
</test>
</tests>
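
For context (not part of the diff): Galaxy substitutes a boolean parameter with the truevalue/falsevalue literals declared in the macro, so the wrapper receives a plain R logical in place of $keep_two_output. A minimal sketch of the two calls the command template above can produce; the file paths and the "tsv" extension are illustrative placeholders, not values from this PR:

# keep_two_output = FALSE (default): only the combined normalized table is written
store_data(normalized_data, "normalized_data.tsv", "metadata.tsv", "tsv", FALSE)

# keep_two_output = TRUE: the metadata columns go to the second dataset and the
# transposed feature table to the first, as exercised by tests 9 and 10 above
store_data(normalized_data, "normalized_data.tsv", "metadata.tsv", "tsv", TRUE)
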
67 changes: 52 additions & 15 deletions tools/waveica/waveica_wrapper.R
@@ -3,10 +3,7 @@ read_file <- function(file, metadata, ft_ext, mt_ext, transpose) {

if (transpose) {
col_names <- c("sampleName", data[[1]])
t_data <- data[-1]
t_data <- t(t_data)
data <- data.frame(rownames(t_data), t_data)
colnames(data) <- col_names
data <- tranpose_data(data, col_names)
}

if (!is.na(metadata)) {
@@ -133,7 +130,6 @@ waveica_singlebatch <- function(file,
return(data)
}


sort_by_injection_order <- function(data) {
if ("batch" %in% colnames(data)) {
data <- data[order(data[, "batch"], data[, "injectionOrder"], decreasing = FALSE), ]
@@ -143,7 +139,6 @@ sort_by_injection_order <- function(data) {
return(data)
}


verify_input_dataframe <- function(data, required_columns) {
if (anyNA(data)) {
stop("Error: dataframe cannot contain NULL values!
@@ -194,7 +189,6 @@ verify_column_types <- function(data, required_columns) {
return(data)
}


# Match group labels with [blank/sample/qc] and enumerate them
enumerate_groups <- function(group) {
group[grepl("blank", tolower(group))] <- 0
@@ -204,7 +198,6 @@ enumerate_groups <- function(group) {
return(group)
}


# Create appropriate input for R wavelets function
get_wf <- function(wavelet_filter, wavelet_length) {
wf <- paste(wavelet_filter, wavelet_length, sep = "")
@@ -217,7 +210,6 @@ get_wf <- function(wavelet_filter, wavelet_length) {
return(wf)
}


# Exclude blanks from a dataframe
exclude_group <- function(data, group) {
row_idx_to_exclude <- which(group %in% 0)
@@ -230,14 +222,59 @@ exclude_group <- function(data, group) {
}
}

store_data <- function(data, output, ext) {
store_data <- function(data, feature_output, metadata_output, ext, split_output = FALSE) {
if (ext == "parquet") {
arrow::write_parquet(data, output)
if (split_output == TRUE) {
split_df <- split_output(data)
arrow::write_parquet(split_df$metadata, metadata_output)
arrow::write_parquet(split_df$feature_table, feature_output)
} else {
arrow::write_parquet(data, feature_output)
}
} else {
write.table(data,
file = output, sep = "\t",
row.names = FALSE, quote = FALSE
)
if (split_output == TRUE) {
split_df <- split_output(data)
write.table(split_df$metadata, file = metadata_output, sep = "\t",
row.names = FALSE, quote = FALSE
)
write.table(split_df$feature_table, file = feature_output, sep = "\t",
row.names = FALSE, quote = FALSE
)
} else {
write.table(data, file = feature_output, sep = "\t",
row.names = FALSE, quote = FALSE
)
}
}
cat("Normalization has been completed.\n")
}

split_output <- function(df) {
required_columns_set1 <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
required_columns_set2 <- c("sampleName", "class", "sampleType", "injectionOrder")

if (all(required_columns_set1 %in% colnames(df))) {
metadata_df <- df[, required_columns_set1, drop = FALSE]
df <- df[, -c(2:5)]
} else if (all(required_columns_set2 %in% colnames(df))) {
metadata_df <- df[, required_columns_set2, drop = FALSE]
df <- df[, -c(2:4)]
} else {
stop("Neither set of required columns is present in the dataframe.")
}

# Transpose the feature table
col_names <- c("id", as.vector(df[[1]]))
feature_table <- tranpose_data(df, col_names)

return(list(metadata = metadata_df, feature_table = feature_table))
}

tranpose_data <- function(data, column_names) {
t_data <- data[-1]
t_data <- t(t_data)
tranposed_data <- data.frame(rownames(t_data), t_data)
colnames(tranposed_data) <- column_names

return(tranposed_data)
}
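
A minimal usage sketch of split_output() (not part of the diff), using the column names from test 10 above with rounded values. It assumes the metadata columns occupy positions 2-5 of the combined table, which is what the positional drop -c(2:5) relies on:

# Hypothetical combined table in the layout split_output() expects:
# sampleName, class, sampleType, injectionOrder, batch, then feature columns.
combined <- data.frame(
    sampleName = c("VT_160120_002", "VT_160120_004"),
    class = c("sample", "sample"),
    sampleType = c("sample", "sample"),
    injectionOrder = c(1, 2),
    batch = c(1, 1),
    M85T34 = c(355200.5, 216897.8),
    M86T41 = c(75115889.9, 75204863.1)
)

result <- split_output(combined)
result$metadata       # the five metadata columns, one row per sample (shape of test10_output2.tsv)
result$feature_table  # "id" column with feature names, one column per sample (shape of test10_output1.tsv)
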