Skip to content

Commit

Permalink
MRG: enable loading lineages from annotated gather with match_name in…
Browse files Browse the repository at this point in the history
…stead of name (#3078)

This PR enables loading from gather lineages files that contain
'match_name' instead of name. Rationale:
- we generally plan to replace 'name' with 'match_name' in gather output
(#1555)
- branchwater plugin's fastmultigather already uses 'match_name'

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
bluegenes and pre-commit-ci[bot] authored Mar 12, 2024
1 parent 283b60b commit 69edb60
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 5 deletions.
15 changes: 11 additions & 4 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1274,7 +1274,9 @@ def load(
elif "accession" in header:
identifier = "accession"
header = ["ident" if "accession" == x else x for x in header]
elif "name" in header and "lineage" in header:
elif "lineage" in header and any(
["name" in header, "match_name" in header]
):
return cls.load_from_gather_with_lineages(
filename, force=force, lins=lins, ictv=ictv
)
Expand Down Expand Up @@ -1390,9 +1392,14 @@ def load_from_gather_with_lineages(
if not header:
raise ValueError(f"cannot read taxonomy assignments from {filename}")

if "name" not in header or "lineage" not in header:
ident_col = None
if "name" in header:
ident_col = "name"
elif "match_name" in header:
ident_col = "match_name"
if "lineage" not in header or ident_col is None:
raise ValueError(
"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?"
"Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?"
)

ranks = None
Expand All @@ -1405,7 +1412,7 @@ def load_from_gather_with_lineages(
for n, row in enumerate(r):
num_rows += 1

name = row["name"]
name = row[ident_col]
ident = get_ident(name)

if lins:
Expand Down
5 changes: 5 additions & 0 deletions tests/test-data/tax/test1.gather.with-lineages.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes,lineage
442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Pseudomonadota;Gammaproteobacteria;Enterobacterales;Enterobacteriaceae;Escherichia;Escherichia coli
390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri
138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Bacteroidaceae;Phocaeicola;Phocaeicola vulgatus
338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507,Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri
25 changes: 24 additions & 1 deletion tests/test_tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp):
with pytest.raises(ValueError) as exc:
LineageDB.load_from_gather_with_lineages(g_res)
assert (
"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?"
"Expected headers 'name'/'match_name' and 'lineage' not found. Is this a with-lineages file?"
in str(exc.value)
)

Expand All @@ -1038,6 +1038,29 @@ def test_check_and_load_gather_lineage_csvs_isdir(runtmp):
assert "is a directory" in str(exc.value)


def test_check_and_load_gather_lineage_csvs_name(runtmp):
# test loading a with-lineage file that has 'name', not 'match_name'
g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv")

lins = LineageDB.load_from_gather_with_lineages(g_res)
assert len(lins) == 4


def test_check_and_load_gather_lineage_csvs_match_name(runtmp):
# test loading a with-lineage file that has 'match_name' instead of 'name'
g_res = utils.get_test_data("tax/test1.gather.with-lineages.csv")
out_lins = runtmp.output("match-name.lineages.csv")
with open(g_res) as f_in:
first_line = f_in.readline().replace("name", "match_name")
with open(out_lins, "w") as f_out:
f_out.write(first_line)
for line in f_in:
f_out.write(line)

lins = LineageDB.load_from_gather_with_lineages(out_lins)
assert len(lins) == 4


def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
g_csv = utils.get_test_data("tax/test1.gather.csv")
# make gather results with taxonomy name not in tax_assign
Expand Down

0 comments on commit 69edb60

Please sign in to comment.