Skip to content

Commit

Permalink
feat(cli): write sorted fasta files
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov committed Sep 4, 2023
1 parent 5b6bc87 commit 1b90f1b
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 3 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions packages_rs/nextclade-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ serde = { version = "=1.0.164", features = ["derive"] }
serde_json = { version = "=1.0.99", features = ["preserve_order", "indexmap", "unbounded_depth"] }
strum = "=0.25.0"
strum_macros = "=0.25"
tinytemplate = "=1.2.1"
url = { version = "=2.4.0", features = ["serde"] }
zip = { version = "=0.6.6", default-features = false, features = ["aes-crypto", "bzip2", "deflate", "time"] }

Expand Down
24 changes: 24 additions & 0 deletions packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,35 @@ pub struct NextcladeSeqSortArgs {
///
/// Sequences will be written in subdirectories: one subdirectory per dataset. Sequences inferred to belong to a particular dataset will be placed in the corresponding subdirectory. The subdirectory tree can be nested, depending on how dataset names are organized.
///
/// Mutually exclusive with `--output`.
///
#[clap(long)]
#[clap(value_hint = ValueHint::DirPath)]
#[clap(hide_long_help = true, hide_short_help = true)]
#[clap(group = "outputs")]
pub output_dir: Option<PathBuf>,

/// Template string for the file path to output sorted sequences. A separate file will be generated per dataset.
///
/// The string should contain template variable `{name}`, where the dataset name will be substituted. Note that if the `{name}` variable contains slashes, they will be interpreted as path segments and subdirectories will be created.
///
/// Make sure you properly quote and/or escape the curly braces, so that your shell, programming language or pipeline manager does not attempt to substitute the variables.
///
/// Mutually exclusive with `--output-dir`.
///
/// If the provided file path ends with one of the supported extensions: "gz", "bz2", "xz", "zst", then the file will be written compressed.
///
/// If the required directory tree does not exist, it will be created.
///
/// Example for bash shell:
///
/// --output='outputs/{name}/sorted.fasta.gz'
#[clap(long)]
#[clap(value_hint = ValueHint::DirPath)]
#[clap(hide_long_help = true, hide_short_help = true)]
#[clap(group = "outputs")]
pub output: Option<String>,

#[clap(flatten, next_help_heading = " Algorithm")]
pub search_params: NextcladeSeqSortParams,

Expand Down
92 changes: 89 additions & 3 deletions packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@ use crate::dataset::dataset_download::download_datasets_index_json;
use crate::io::http_client::HttpClient;
use eyre::{Report, WrapErr};
use itertools::Itertools;
use log::{info, LevelFilter};
use nextclade::io::fasta::{FastaReader, FastaRecord};
use log::{info, trace, LevelFilter};
use nextclade::io::fasta::{FastaReader, FastaRecord, FastaWriter};
use nextclade::make_error;
use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION};
use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchResult};
use nextclade::sort::params::NextcladeSeqSortParams;
use nextclade::utils::string::truncate;
use serde::Serialize;
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::str::FromStr;
use tinytemplate::TinyTemplate;

#[derive(Debug, Clone)]
struct MinimizerSearchRecord {
Expand All @@ -18,6 +22,8 @@ struct MinimizerSearchRecord {
}

pub fn nextclade_seq_sort(args: &NextcladeSeqSortArgs) -> Result<(), Report> {
check_args(args)?;

let NextcladeSeqSortArgs {
server,
proxy_config,
Expand Down Expand Up @@ -66,6 +72,7 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) ->
let NextcladeSeqSortArgs {
input_fastas,
output_dir,
output,
search_params,
other_params: NextcladeRunOtherParams { jobs },
..
Expand Down Expand Up @@ -122,11 +129,55 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) ->
}

let writer = s.spawn(move || {
let output_dir = &output_dir;
let output = &output;

let tt = output.as_ref().map(move |output| {
let mut tt = TinyTemplate::new();
tt.add_template("output", output)
.wrap_err_with(|| format!("When parsing template: {output}"))
.unwrap();
tt
});

println!(
"{:40} | {:40} | {:10} | {:10}",
"Seq. name", "dataset", "total hits", "max hit"
);

let mut writers = BTreeMap::new();

for record in result_receiver {
if let Some(name) = &record.result.dataset {
let filepath = match (&tt, output_dir) {
(Some(tt), None) => {
let filepath_str = tt
.render("output", &OutputTemplateContext { name })
.wrap_err("When rendering output path template")
.unwrap();

Some(
PathBuf::from_str(&filepath_str)
.wrap_err_with(|| format!("Invalid output translations path: '{filepath_str}'"))
.unwrap(),
)
}
(None, Some(output_dir)) => Some(output_dir.join(name).join("sequences.fasta")),
_ => None,
};

if let Some(filepath) = filepath {
let writer = writers.entry(filepath.clone()).or_insert_with(|| {
trace!("Creating fasta writer to file {filepath:#?}");
FastaWriter::from_path(filepath).unwrap()
});

writer
.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)
.unwrap();
}
}

println!(
"{:40} | {:40} | {:>10} | {:>.3}",
&truncate(record.fasta_record.seq_name, 40),
Expand All @@ -140,3 +191,38 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) ->

Ok(())
}

/// Values made available to the `--output` path template when it is rendered.
///
/// The field name is the variable name seen by the template, so `name` is what
/// gets substituted for `{name}`.
#[derive(Serialize)]
struct OutputTemplateContext<'n> {
  /// Dataset name to substitute into the output path template.
  name: &'n str,
}

fn check_args(args: &NextcladeSeqSortArgs) -> Result<(), Report> {
let NextcladeSeqSortArgs { output_dir, output, .. } = args;

if output.is_some() && output_dir.is_some() {
return make_error!(
"The arguments `--output-dir` and `--output` cannot be used together. Remove one or the other."
);
}

if let Some(output) = output {
if !output.contains("{name}") {
return make_error!(
r#"
Expected `--output` argument to contain a template string containing template variable {{name}} (with curly braces), but received:
{output}
Make sure the variable is not substituted by your shell, programming language or workflow manager. Apply proper escaping as needed.
Example for bash shell:
--output='outputs/{{name}}/sorted.fasta.gz'
"#
);
}
}

Ok(())
}

0 comments on commit 1b90f1b

Please sign in to comment.