-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Deploy for release 0.3.3 from 4fef684
- Loading branch information
Showing
55 changed files
with
1,756 additions
and
1,033 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
"""This Module is used to convert MaxQuant output to anndata""" | ||
import pandas as pd | ||
import anndata as ad | ||
|
||
## VIASH START
# Default parameters used when this script is run directly.
# NOTE(review): the VIASH START/END markers suggest that Viash substitutes
# this block with the real component arguments at build time -- confirm.
par = {
    # directory with the MaxQuant results; the script reads
    # combined/txt/summary.txt and combined/txt/proteinGroups.txt below it
    "input": "resources_test/zenodo_4274987/maxquant_out",
    # path of the .h5ad file written at the end of the script
    "output": "resources_test/zenodo_4274987/maxquant_out/output.h5ad",
}
## VIASH END
|
||
|
||
# helper function for transforming column names in proteingroups | ||
# to snakecase | ||
def fix_headers(dataframe_old:pd.DataFrame)->pd.DataFrame:
    """Return a copy of the frame with normalised headers and integer booleans.

    Column names are lower-cased; ``+``, ``%`` and spaces are each replaced
    by the literal text ``and``; every remaining character outside ``a-z``,
    ``0-9`` and ``_`` is removed. Boolean values are then replaced by
    ``0``/``1``, both in bool-dtype columns and in object columns that hold
    booleans next to other values (e.g. missing entries).

    Args:
        dataframe_old: Frame to normalise. It is not modified.

    Returns:
        A new DataFrame with the same rows and index, renamed columns and
        no boolean values.
    """
    dataframe = dataframe_old.copy(deep=True)

    dataframe.columns = dataframe.columns.str.lower()

    # NOTE(review): spaces are mapped to "and" rather than "_", so the result
    # is not really snake_case ("raw file" -> "rawandfile"). This looks
    # unintended, but the mapping is left untouched because the column names
    # written to the output depend on it -- confirm before changing.
    replaces={ "+":("and",False),
               "%":("and",False),
               " ":("and",False),
               "[^a-z0-9_]*":("",True)
             }

    for old, (new,use_regex) in replaces.items():
        dataframe.columns = dataframe.columns.str.replace(old, new, regex=use_regex)

    #TODO figure out which are causing the issues
    is_bool = pd.api.types.is_bool
    converted_columns = []
    for column_name, column in dataframe.items():
        if column.dtype == bool:
            print(f"Removing booleans for : {column_name} ")
            column = column.astype(int)
        elif column.dtype == object and column.map(is_bool).any():
            # object columns can mix booleans with e.g. missing values;
            # only the boolean entries are rewritten
            print(f"Removing booleans for : {column_name} ")
            column = column.map(
                lambda value: int(value) if is_bool(value) else value)
        converted_columns.append(column)

    if not converted_columns:
        return dataframe

    # Rebuild the frame column by column (by position) so that the converted
    # values are actually kept, even when the normalised names collide.
    result = pd.concat(converted_columns, axis=1)
    result.columns = dataframe.columns
    return result
|
||
# helper function to collate layer data from proteingroups | ||
def get_layer_data(_protein_groups:pd.DataFrame, template:str,
                   sample_ids:pd.Series)->pd.DataFrame:
    """Collect one per-sample quantity from the protein groups table.

    For every sample id the column named ``template.format(sample_id=...)``
    is looked up in ``_protein_groups``. The selected columns are relabelled
    with the sample ids and the table is transposed.

    Args:
        _protein_groups: Table holding one column per sample for the
            requested quantity.
        template: Column name pattern containing a ``{sample_id}``
            placeholder, e.g. ``"Intensity {sample_id}"``.
        sample_ids: The sample identifiers, in the order the rows of the
            result should have.

    Returns:
        A DataFrame with one row per sample id (in the order of
        ``sample_ids``) and one column per row of ``_protein_groups``.

    Raises:
        KeyError: If a column expected for one of the samples is missing.
    """
    sample_index = pd.Index(sample_ids)
    headers = [template.format(sample_id=sample_id) for sample_id in sample_index]
    # Select by label, in header order: a boolean ``isin`` mask would return
    # the columns in file order, and relabelling them with the sample ids
    # would then attach values to the wrong sample when the orders differ.
    dataframe = _protein_groups.loc[:, headers].copy()
    dataframe.columns = sample_index
    return dataframe.transpose()
|
||
# read sample metadata
summary = pd.read_table(f"{par['input']}/combined/txt/summary.txt")
# Keep only the rows whose "Raw file" does not contain "Total".
# NOTE(review): the explicit `== False` is kept as-is (the original author
# marked it as the only confirmed working form); presumably it also copes
# with rows where `str.contains` yields a missing value, which `~` would
# not -- confirm before rewriting.
summary_nt = summary[summary["Raw file"].str.contains("Total")==False]
# read protein group info
protein_groups = pd.read_table(f"{par['input']}/combined/txt/proteinGroups.txt")

# use hardcoded templates
#TODO evaluate strategy (alternative = dynamically load column headers)
# layer name -> proteinGroups column name pattern for one sample
templates = {
    "peptides": "Peptides {sample_id}",
    "razor_and_unique_peptides": "Razor + unique peptides {sample_id}",
    "unique_peptides": "Unique peptides {sample_id}",
    "sequence_coverage": "Sequence coverage {sample_id} [%]",
    "intensity": "Intensity {sample_id}",
}

# The sample IDs
sampleIDs = summary_nt.loc[:, "Experiment"]

# one samples-by-protein-groups table per template
layers = {
    key: get_layer_data(protein_groups, template, sampleIDs)
    for key, template in templates.items()
}

# set sample metadata as observations
obs = fix_headers(summary_nt)
# set protein identifications as metadata
var = fix_headers(protein_groups)

# Create an AnnData object without a main matrix; the data lives in layers
adata = ad.AnnData(None, obs, var)
for key, layer in layers.items():
    adata.layers[key] = layer

# Export data to file...
adata.write_h5ad(par["output"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from unittest import main, TestCase | ||
import subprocess | ||
from pathlib import Path | ||
import anndata as ad | ||
|
||
## VIASH START
# Defaults used when the test file is run directly.
# NOTE(review): the VIASH START/END markers suggest that Viash substitutes
# this block with the real values when it runs the test -- confirm.
meta = {
    'executable': './target/docker/convert/maxquant_to_h5ad',
    'resources_dir': './resources_test/zenodo_4274987',
}
## VIASH END

# name of the file the component is asked to write
target = "output.h5ad"
resources_dir, executable = meta["resources_dir"], meta["executable"]
# path of the output inside the resources directory (a stray "]" that used
# to be appended to this path has been removed)
conversion_output = f"{resources_dir}/{target}"
|
||
class TestMaxQuantToHAD(TestCase):
    """Runs the maxquant_to_h5ad executable and inspects the file it writes."""

    def _run_and_check_output(self, args_as_list, expected_raise=False):
        """Run the component executable with the given arguments.

        Args:
            args_as_list: Command line arguments appended to the executable.
            expected_raise: When true, a non-zero exit status is tolerated;
                otherwise the captured output is printed and the error is
                re-raised so the test fails.
        """
        try:
            subprocess.check_output([meta['executable']] + args_as_list, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            if not expected_raise:
                print(e.stdout.decode("utf-8"))
                # bare raise keeps the original traceback
                raise

    def test_maxquant_convert(self):
        """Convert the example MaxQuant output and check the resulting h5ad."""
        self._run_and_check_output(["--input", "zenodo_4274987/maxquant_out",
                                    "--output", target])
        self.assertTrue(Path(target).is_file())
        converted_data = ad.read_h5ad(target)

        #Check if the specified data layers are present in the output
        #(names are sorted so the check does not depend on the order in
        #which the layers are read back from the file)
        expected_layers = ['intensity', 'peptides', 'razor_and_unique_peptides',
                           'sequence_coverage', 'unique_peptides']
        self.assertListEqual(sorted(converted_data.layers), expected_layers)
        #Check the number of observations (should be 2: Sample1 & Sample2)
        self.assertEqual(len(converted_data.obs),2)
        #Check the number of rows in var (one per protein group), should be 270 in the given example
        self.assertEqual(len(converted_data.var),270)
        #TODO is it worth testing further in depth?
|
||
# Run the tests of this module through unittest's main() when the file is
# executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.