Skip to content

Commit

Permalink
Adding rules to aggregate blast results into a spreadsheet
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Nov 28, 2023
1 parent c3e1588 commit 7684769
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 5 deletions.
5 changes: 5 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,17 @@ rule all:
join(workpath,"{name}","output","{name}.metaspades_blast.tsv"),
name=pe_samples
),
provided(
[join(workpath,"Project","metaspades_blast.xlsx")],
pe_samples
),
# BLAST Megahit contigs against nt viruses database,
# @imported from rules/paired-end.smk
expand(
join(workpath,"{name}","output","{name}.megahit_blast.tsv"),
name=samples
),
join(workpath,"Project","megahit_blast.xlsx"),
# Krona, taxonomic classification report
# @imported from rules/paired-end.smk
expand(
Expand Down
112 changes: 109 additions & 3 deletions workflow/rules/paired-end.smk
Original file line number Diff line number Diff line change
Expand Up @@ -695,7 +695,7 @@ rule blast_metaspades_contigs:
output:
blast=join(workpath,"{name}","output","{name}.metaspades_blast.tsv"),
params:
rname='blastmetaspadest',
rname='blastmetaspades',
blast_db=config['references']['blast_viral_db'],
header=join(workpath,"{name}","output","{name}.metaspades_blast.header.tsv"),
tmp=join(workpath,"{name}","output","{name}.metaspades_blast.tmp.tsv"),
Expand All @@ -717,7 +717,7 @@ rule blast_metaspades_contigs:
container: config['images']['blast']
shell: """
# BLAST metaspades contigs against viral database
echo -e "#{params.outfmt_tabs}" \\
echo -e "{params.outfmt_tabs}" \\
> {params.header}
blastn -query {input.contigs} \\
-db {params.blast_db} \\
Expand All @@ -733,6 +733,59 @@ rule blast_metaspades_contigs:
"""


rule blast_metaspades_xlsx:
    """
    Data-processing step to aggregate the blast-ed metaspades contigs results into
    a single XLSX file. Within the output XLSX file, there will be a tab for each
    sample's blast results. The first tab in the spreadsheet will contain the
    concatenated blast results across all samples.
    @Input:
        Blast text files containing alignment results of metaspades contigs (gather)
    @Output:
        Blast XLSX file containing alignment results of metaspades contigs for all samples
    """
    input:
        blasts=expand(join(workpath,"{name}","output","{name}.metaspades_blast.tsv"), name=pe_samples),
    output:
        tsv=join(workpath,"Project","metaspades_blast.tsv"),
        excel=join(workpath,"Project","metaspades_blast.xlsx"),
    params:
        rname='blastmetaspadesxlsx',
        # Header for the aggregated TSV: per-sample blast columns
        # prefixed with a new leading 'sample' column
        outfmt_tabs='\\t'.join([
            'sample', 'qaccver', 'saccver', 'staxid',
            'ssciname', 'scomname', 'sblastname',
            'sscinames', 'stitle', 'pident', 'qlen',
            'length', 'mismatch', 'gapopen', 'qstart',
            'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]),
        extension=".metaspades_blast.tsv",
        # NOTE(review): this commit modifies "files2spreadsheet.py" (plural),
        # but the rule points at "file2spreadsheet.py" -- confirm the script
        # name on disk matches this path
        script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"),
    threads: int(allocated("threads", "blast_metaspades_xlsx", cluster))
    container: config['images']['blast']
    shell: """
    # Create aggregated single file
    # results of BLAST-ed metaspades
    # contigs against viral database
    echo -e "{params.outfmt_tabs}" \\
    > {output.tsv}
    # Adding a column to the output
    # that contains each sample name.
    # FNR>1 skips each input's header;
    # must APPEND (">>") here, otherwise
    # the header line written above is
    # clobbered by this redirection.
    awk -F '\\t' -v OFS='\\t' \\
        'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\
        {input.blasts} \\
    >> {output.tsv}
    # Create an excel spreadsheet
    # containing the blast results
    # of aligning the metaspades
    # contigs against NCBI viral db
    {params.script} \\
        --rm-suffix "{params.extension}" \\
        --input {output.tsv} {input.blasts} \\
        --output {output.excel}
    """


rule blast_megahit_contigs:
"""
Data-processing step to blast assembled contigs against the nt viruses database.
Expand Down Expand Up @@ -772,7 +825,7 @@ rule blast_megahit_contigs:
container: config['images']['blast']
shell: """
# BLAST megahit contigs against viral database
echo -e "#{params.outfmt_tabs}" \\
echo -e "{params.outfmt_tabs}" \\
> {params.header}
blastn -query {input.contigs} \\
-db {params.blast_db} \\
Expand All @@ -788,6 +841,59 @@ rule blast_megahit_contigs:
"""


rule blast_megahit_xlsx:
    """
    Data-processing step to aggregate the blast-ed megahit contigs results into
    a single XLSX file. Within the output XLSX file, there will be a tab for each
    sample's blast results. The first tab in the spreadsheet will contain the
    concatenated blast results across all samples.
    @Input:
        Blast text files containing alignment results of megahit contigs (gather)
    @Output:
        Blast XLSX file containing alignment results of megahit contigs for all samples
    """
    input:
        blasts=expand(join(workpath,"{name}","output","{name}.megahit_blast.tsv"), name=samples),
    output:
        tsv=join(workpath,"Project","megahit_blast.tsv"),
        excel=join(workpath,"Project","megahit_blast.xlsx"),
    params:
        rname='blastmegahitxlsx',
        # Header for the aggregated TSV: per-sample blast columns
        # prefixed with a new leading 'sample' column
        outfmt_tabs='\\t'.join([
            'sample', 'qaccver', 'saccver', 'staxid',
            'ssciname', 'scomname', 'sblastname',
            'sscinames', 'stitle', 'pident', 'qlen',
            'length', 'mismatch', 'gapopen', 'qstart',
            'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]),
        extension=".megahit_blast.tsv",
        # NOTE(review): this commit modifies "files2spreadsheet.py" (plural),
        # but the rule points at "file2spreadsheet.py" -- confirm the script
        # name on disk matches this path
        script=join(workpath, "workflow", "scripts", "file2spreadsheet.py"),
    threads: int(allocated("threads", "blast_megahit_xlsx", cluster))
    container: config['images']['blast']
    shell: """
    # Create aggregated single file
    # results of BLAST-ed megahit
    # contigs against viral database
    echo -e "{params.outfmt_tabs}" \\
    > {output.tsv}
    # Adding a column to the output
    # that contains each sample name.
    # FNR>1 skips each input's header;
    # must APPEND (">>") here, otherwise
    # the header line written above is
    # clobbered by this redirection.
    awk -F '\\t' -v OFS='\\t' \\
        'FNR>1 {{sub("{params.extension}", "", FILENAME); print FILENAME, $0}}' \\
        {input.blasts} \\
    >> {output.tsv}
    # Create an excel spreadsheet
    # containing the blast results
    # of aligning the megahit
    # contigs against NCBI viral db
    {params.script} \\
        --rm-suffix "{params.extension}" \\
        --input {output.tsv} {input.blasts} \\
        --output {output.excel}
    """


rule krona:
"""
Reporting step to create a interactive Krona charts based on
Expand Down
4 changes: 2 additions & 2 deletions workflow/scripts/files2spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,9 +298,9 @@ def excel_writer(files, spreadsheet, skip_comments=None, remove_suffix = ''):
# additional suffix to remove
# we must take into account the
# extension has already been
# removed prior to
# removed, see above
cleaned_suffix, ext = os.path.splitext(remove_suffix)
sheet = sheet.split(cleaned_suffix)[0]
sheet = sheet.rsplit(cleaned_suffix, 1)[0]

# Sheet name cannot exceed
# 31 characters in length
Expand Down

0 comments on commit 7684769

Please sign in to comment.