From 35ef42580f44b97039718f356a7fa52b1ec84d61 Mon Sep 17 00:00:00 2001 From: markschl Date: Mon, 16 Sep 2024 15:12:48 +0200 Subject: [PATCH] Add comparison of tools & adjust docs * Added some scripts for profiling seqtool and other tools * Adjusted some docs (see also seqtool-docs repo) --- Cargo.toml | 2 +- README.md | 151 ++++++++------- profile/README.md | 79 ++++++++ profile/comparison_commands.yaml | 305 +++++++++++++++++++++++++++++++ profile/fastq_urls.txt | 15 ++ scripts/compare_tools.py | 185 +++++++++++++++++++ scripts/summarize_comparison.py | 87 +++++++++ src/cli.rs | 3 +- src/cmd/count.rs | 15 +- src/cmd/find/vars.rs | 10 +- src/cmd/mask.rs | 8 +- src/cmd/trim.rs | 8 +- var_provider/src/var_provider.rs | 6 +- wix/main.wxs | 2 +- 14 files changed, 779 insertions(+), 97 deletions(-) create mode 100644 profile/README.md create mode 100644 profile/comparison_commands.yaml create mode 100644 profile/fastq_urls.txt create mode 100644 scripts/compare_tools.py create mode 100644 scripts/summarize_comparison.py diff --git a/Cargo.toml b/Cargo.toml index e4da5a1..7572f3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "seqtool" version = "0.4.0-beta.2" edition = "2021" authors = ["Markus Schlegel "] -description = "General purpose tool for reading, modifying and writing biological sequences." +description = "General-purpose tool for reading, modifying and writing biological sequences." license = "MIT" repository = "https://github.com/markschl/seqtool" readme = "README.md" diff --git a/README.md b/README.md index 0132d8f..dfbfabe 100755 --- a/README.md +++ b/README.md @@ -1,55 +1,61 @@ **Seqtool** is a fast and flexible command line program for dealing with large amounts of biological sequences. -It provides different subcommands for *converting*, *inspecting* and *modifying* -sequences. +It provides different [subcommands](#commands) for *converting*, *inspecting* +and *modifying* sequences. The standalone binary (~6 MB) is simply named `st` to save some typing. +[![CI](https://github.com/markschl/seqtool/actions/workflows/ci.yaml/badge.svg)](https://github.com/markschl/seqtool/actions/workflows/ci.yaml) + > **Note:** this page describes the development version 0.4-beta. > **The older stable version (v0.3.0) is [documented here](https://github.com/markschl/seqtool/wiki).** -> ⚠ Also note that **there are some bugs in v0.3.0**, -> see [CHANGELOG](https://github.com/markschl/seqtool/blob/main/CHANGELOG.md#important-bugfixes-). -> Alternatively, v0.4.0-beta should be pretty safe to use already. -**[📥 download stable release (v0.3.0)](https://github.com/markschl/seqtool/releases/latest)** +## Detailed documentation -**[📥 download beta release (v0.4.0-beta)](https://github.com/markschl/seqtool/releases/tag/0.4.0-beta.2)** +**See [this page](https://markschl.github.io/seqtool-docs)** -[![CI](https://github.com/markschl/seqtool/actions/workflows/ci.yaml/badge.svg)](https://github.com/markschl/seqtool/actions/workflows/ci.yaml) +## Downloads +**[📥 download stable release (v0.3.0)](https://github.com/markschl/seqtool/releases/latest)** -## Detailed documentation +> **⚠ Note**: there are a few **unfixed bugs in v0.3.0** (currently) +> when reading GZIP files or searching/replacing; +> see [CHANGELOG for v0.4.0-beta](https://github.com/markschl/seqtool/blob/main/CHANGELOG.md#important-bugfixes-). +> Alternatively, consider using v0.4.0-beta. -**See [this page](https://markschl.github.io/seqtool-docs)** +**[📥 download beta release (v0.4.0-beta)](https://github.com/markschl/seqtool/releases/tag/0.4.0-beta.2)** + +> Should be pretty safe to use despite considerable refactoring. +> Approximate matching (https://markschl.github.io/seqtool-docs/[find](find) command) is yet to be fully tested. ## Feature overview ### File formats -[**Reads** and **writes**](https://markschl.github.io/seqtool-docs/pass) **FASTA, FASTQ** and **CSV/TSV**, optionally compressed +[**Reads** and **writes**](https://markschl.github.io/seqtool-docs/formats) **FASTA, FASTQ** and **CSV/TSV**, optionally compressed with [GZIP](https://en.wikipedia.org/wiki/Gzip), [BZIP2](https://en.wikipedia.org/wiki/Bzip2), or the faster and more modern [Zstandard](http://facebook.github.io/zstd/) or [LZ4](https://lz4.org/) formats -
+
Example: compressed FASTQ to FASTA Combine multiple compressed FASTQ files, converting them to FASTA, using [pass](https://markschl.github.io/seqtool-docs/pass). -> **Note**: almost every command can read multiple input files and convert between formats, -> but *pass* does nothing other than reading and writing while other command perform certain actions. - -```sh +```bash st pass file1.fastq.gz file2.fastq.gz -o output.fasta ``` +> **Note**: almost every command can read multiple input files and convert between formats, +> but *pass* does nothing other than reading and writing while other command perform certain actions. +
-
+
Example: FASTA to tab-separated list @@ -57,7 +63,7 @@ Example: FASTA to tab-separated list Aside from ID and sequence, any [variable/function](https://markschl.github.io/seqtool-docs/variables) such as the sequence length (`seqlen`) can be written to delimited text. -```sh +```bash st pass input.fasta --to-tsv id,seq,seqlen ``` @@ -70,23 +76,16 @@ id1 ACGTA 5
-### Commands for many different tasks - -([see list below](#commands)) - -### Highly versatile +### Highly versatile thanks to variables/functions -... thanks to **[variables/functions](https://markschl.github.io/seqtool-docs/variables)** +See also **[variables/functions](https://markschl.github.io/seqtool-docs/variables)** for more details. -
+
Example: count sequences in a large set of FASTQ files -In [count](https://markschl.github.io/seqtool-docs/count), one or several categorical [variables/functions](https://markschl.github.io/seqtool-docs/variables) -can be specified with `-k/--key`. - -```sh +```bash st count -k path data/*.fastq.gz ``` @@ -99,9 +98,12 @@ data/sample5.fastq.gz 7021 (...) ``` +> In [count](https://markschl.github.io/seqtool-docs/count), one or several categorical [variables/functions](https://markschl.github.io/seqtool-docs/variables) +> can be specified with `-k/--key`. +
-
+
Example: summarize the GC content in 10% intervals @@ -109,7 +111,7 @@ Example: summarize the GC content in 10% intervals The function `bin(variable, interval)` groups continuous numeric values into intervals -```sh +```bash st count -k 'bin(gc_percent, 10)' sequences.fasta ``` @@ -123,12 +125,12 @@ st count -k 'bin(gc_percent, 10)' sequences.fasta
-
+
Example: Assign new sequence IDs -```sh +```bash st set -i 'seq_{num}' seqs.fasta > renamed.fasta ``` @@ -144,7 +146,7 @@ SEQUENCE
-
+
Example: De-replicate by description and sequence @@ -164,7 +166,7 @@ SEQUENCE1 SEQUENCE1 ``` -```sh +```bash st unique 'desc,seq' seqs.fasta > grouped_uniques.fasta ``` @@ -185,7 +187,7 @@ From simple math to complicated filter [expressions](https://markschl.github.io/ ([QuickJS](https://bellard.org/quickjs)) offers countless possibilities for customized sequence processing. -
+
Example: filter FASTQ sequences by quality and length @@ -194,19 +196,19 @@ This [filter](https://markschl.github.io/seqtool-docs/filter) command removes se sequencing error (like [USEARCH](https://www.drive5.com/usearch/manual/exp_errs.html) can do) or sequence length of <100 bp. -```sh +```bash st filter 'exp_err < 1 && seqlen >= 100' reads.fastq > filtered.fastq ```
-### Header attributes +### Header attributes for metadata storage **`key=value` [header attributes](https://markschl.github.io/seqtool-docs/attributes)** allow storing and passing on all kinds of information -
+
Example: De-replicate by sequence (seq variable) and/or other properties @@ -214,7 +216,7 @@ Example: De-replicate by sequence (seq variable) and/or other properties The [unique](https://markschl.github.io/seqtool-docs/unique) command returns all unique sequences and annotates the number of records with the same sequence in the header: -```sh +```bash st unique seq -a abund={n_duplicates} input.fasta > uniques.fasta ``` @@ -229,7 +231,7 @@ GGAGGATCCGAGCG It is also possible to de-replicate by multiple keys, e.g. by sequence, but grouped by a `sample` attribute in the header: -```sh +```bash st unique 'seq,attr(sample)' input.fasta > uniques.fasta ``` @@ -247,7 +249,7 @@ SEQUENCE4
-
+
Example: pre-processing of mixed multi-marker amplicon sequences (primer trimming, grouping by amplicon) @@ -258,7 +260,7 @@ multi-marker amplicons. and finally [split](https://markschl.github.io/seqtool-docs/split) distributes the sequences into different files named by the forward primer. -`primers.fasta` +**primers.fasta** ``` >prA @@ -267,16 +269,18 @@ PRIMER PRIMER ``` -```sh +**Command for searching/trimming** + +```bash st find file:primers.fasta -a primer='{pattern_name}' -a end='{match_end}' sequences.fasta | st trim -e '{attr(end)}..' | st split -o '{attr(primer)}' ``` - +
- - + - -
prA.fasta prB.fastaundefined.fasta
+
``` >id1 primer=prA end=22 @@ -287,7 +291,7 @@ SEQUENCE ``` + ``` >id2 primer=prB end=20 @@ -298,7 +302,7 @@ SEQUENCE ``` + ``` >id5 primer=undefined end=undefined @@ -315,19 +319,19 @@ UNTRIMMEDSEQUENCE -### Metadata integration +### Integration of external metadata Integration of [**sequence metadata sources**](https://markschl.github.io/seqtool-docs/meta) in the form of delimited text -
+
Example: Add Genus names from a separate tab-separated list - +
- - + -
input.fastagenus.tsv
+
``` >id1 @@ -338,7 +342,7 @@ SEQUENCE ``` + ``` id genus @@ -353,14 +357,14 @@ seq2 Amycolatopsis Using `-m/--meta` to include `genus.tsv` as metadata source: -```sh +```bash st set -m genus.tsv --desc '{meta(genus)}' input.fasta > with_genus.fasta ``` - +
- - +
with_genus.fasta
+
``` >seq1 Actinomyces @@ -375,15 +379,15 @@ SEQUENCE
-
+
Example: Choose specific sequences given a separate file with an ID list - +
- - +
input.fastaid_list.txt
+
``` >id1 @@ -409,14 +413,14 @@ id4
-```sh +```bash st filter -m id_list.txt 'has_meta()' input.fasta > subset.fasta ``` - +
- - +
subset.fasta
+
``` >id1 @@ -442,7 +446,7 @@ attribute setting ### Subset/shuffle * **[sort](https://markschl.github.io/seqtool-docs/sort)**: Sort records by sequence or any other criterion -* **[unique](https://markschl.github.io/seqtool-docs/unique)**: General purpose tool for reading, modifying and writing biological sequences. +* **[unique](https://markschl.github.io/seqtool-docs/unique)**: De-replicate by sequence and/or other properties, returning only unique records * **[filter](https://markschl.github.io/seqtool-docs/filter)**: Keep/exclude sequences based on different properties with a mathematical (JavaScript) expression * **[split](https://markschl.github.io/seqtool-docs/split)**: Distribute sequences into multiple files based on a variable/function or @@ -470,3 +474,14 @@ concatenate several ranges * **[revcomp](https://markschl.github.io/seqtool-docs/revcomp)**: Reverse complements DNA or RNA sequences * **[concat](https://markschl.github.io/seqtool-docs/concat)**: Concatenates sequences/alignments from different files +## Comparison with other tools + +There are other tools with a similar focus such as [Seqtk](https://github.com/lh3/seqtk), +[SeqKit](https://github.com/shenwei356/seqkit), the [FASTX-Toolkit](https://github.com/agordon/fastx_toolkit), +as well as the more specialized [USEARCH](https://www.drive5.com/usearch) and +[VSEARCH](https://github.com/torognes/vsearch) offering some of the functions +as well. + +*Seqtool* performs well compared to these tools on a selection of diverse tasks: + +**[Comparison of tools](https://markschl.github.io/seqtool-docs/comparison)** diff --git a/profile/README.md b/profile/README.md new file mode 100644 index 0000000..d121299 --- /dev/null +++ b/profile/README.md @@ -0,0 +1,79 @@ +# Measuring time and memory / comparison to other tools + +The following should work for Ubuntu Linux. + +```bash +outdir=target/st_benchmark +fq=$outdir/reads.fq +mkdir -p $outdir +``` + +## Build the binary + +```bash +cargo build --release +st=target/release/st +``` + +## Download sequencing reads + + +```bash +wget -qi profile/fastq_urls.txt -O - | zcat > $fq +ls -lh $fq +``` + +## Create temporary storage + +We rely on *tmpfs* to store output (and some input) files in memory, +avoiding disk IO latency as much as possible. + +```bash +rm -Rf $outdir/workdir && mkdir $outdir/workdir +chmod 777 $outdir/workdir +sudo mount -t tmpfs -o size=10G none $outdir/workdir +mkdir -p $outdir/workdir/tmp +``` + +Prepare forward primer for searching + +```bash +cat > $outdir/workdir/primers.fasta <<- EOM +>ITS4 +GTCCTCCGCTTATTGATATGC +EOM +``` + +## Run the benchmarks + +Before running, disable frequency boost: + +```bash +echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost +``` + +On Ubuntu, disable the indexer for full-text search + +```bash +echo -n > $outdir/workdir/.trackerignore +``` + +Run the comparison. The `compare_tools.py` does not only compare runtimes / memory usage, +but in some cases also validates that the output is the same. +See `comparison_commands.yaml`. + +```bash +export SEQKIT_THREADS=1 +$st count $fq # cache the file in memory +scripts/compare_tools.py \ + -b $st -d $outdir/workdir -o profile/comparison.json -t $outdir/workdir/tmp \ + $fq profile/comparison_commands.yaml + +scripts/summarize_comparison.py profile/comparison.json - > profile/comparison.md +``` + +## Clean up + +```bash +rm -Rf $outdir/workdir +``` diff --git a/profile/comparison_commands.yaml b/profile/comparison_commands.yaml new file mode 100644 index 0000000..2baa426 --- /dev/null +++ b/profile/comparison_commands.yaml @@ -0,0 +1,305 @@ +pass: + no_action: + description: Do nothing, just read and write FASTA + cmd: st pass input.fasta > output.fasta + other: + SeqKit: seqkit seq [-w 0] input.fasta > output.fasta + compare_with: [SeqKit] + fasta: + description: Convert FASTQ to FASTA + cmd: st pass --to-fa input.fastq > output.fasta + other: + FASTX-Toolkit: fastq_to_fasta -Q33 -i input.fastq > output.fasta + Seqtk: seqtk seq -A input.fastq > output.fasta + SeqKit: seqkit fq2fa input.fastq > output.fasta + compare_with: [SeqKit] + qual_scores: + description: Convert FASTQ quality scores + cmd: st pass --to fastq-illumina input.fastq > output.fastq + other: + VSEARCH: vsearch --fastq_convert input.fastq --fastq_asciiout 64 --fastqout output.fastq + SeqKit: seqkit convert --from 'Sanger' --to 'Illumina-1.3+' input.fastq > output.fastq + compare_with: [SeqKit] + gzip_compress: + description: Write compressed FASTQ files in GZIP format + cmd: st pass input.fastq -o output.fastq.gz + other: + SeqKit: seqkit seq input.fastq -o output.fastq.gz + seqtool | gzip: st pass input.fastq | gzip -c > output.fastq.gz + gzip directly: gzip -kf input.fastq + pigz directly (4 threads): pigz -p4 -kf input.fastq + compress_zstd: + description: Write compressed FASTQ files in Zstandard format + cmd: st pass input.fastq -o output.fastq.zst + other: + seqtool | zstd piped: st pass input.fastq | zstd -c > output.fastq.zst + compress_lz4: + description: Write compressed FASTQ files in Lz4 format + cmd: st pass input.fastq -o output.fastq.lz4 + other: + seqtool | lz4 piped: st pass input.fastq | lz4 -c > output.fastq.lz4 + + +count: + all: + description: Count the number of FASTQ sequences in the input + cmd: st count input.fastq + other: + Seqtk: seqtk size input.fasta + compare: + seqtk: + - st count input.fastq > output.tsv + - seqtk size input.fastq | cut -f1 > output.tsv + gc: + description: Count the number of FASTQ sequences, grouped by GC content (in 10% intervals) + cmd: st count -k 'bin(gc_percent, 10)' input.fastq + other: + st with math expression: st count -k '{bin(gc_percent/100*100, 10)}' input.fastq + + +sort: + sequence: + description: Sort by sequence + cmd: st sort seq input.fasta > output.fasta + other: + SeqKit: seqkit sort -s [-w 0] input.fasta > output.fasta + sequence_lim: + description: Sort by sequence with ~ 50 MiB memory limit + cmd: st sort seq input.fasta -M 50M > output.fasta + other: + 100 MiB memory limit: st sort seq input.fasta -M 100M > output.fasta + + id: + description: Sort by record ID + cmd: st sort id input.fasta > output.fasta + other: + SeqKit: seqkit sort [-w 0] input.fasta > output.fasta + compare_with: [SeqKit] + length: + description: Sort by sequence length + cmd: st sort seqlen input.fasta > output.fasta + other: + SeqKit: seqkit sort -l [-w 0] input.fasta > output.fasta + VSEARCH: vsearch --sortbylength input.fasta --output output.fasta [--fasta_width 0] + size: + description: Sort sequences by USEARCH/VSEARCH-style abundance annotations + cmd: > + ST_ATTR_FMT=';key=value' st unique seq -a size={n_duplicates} input.fasta | + st sort '{-attr("size")}' > output.fasta + other: + VSEARCH: > + vsearch --derep_fulllength input.fasta --output - --sizeout | + vsearch --sortbysize - --output output.fasta [--fasta_width 0] + compare_with: [] + + +unique: + seqhash: + description: > + Remove duplicate sequences using sequence hashes. + This is more memory efficient and usually faster than keeping the whole + sequence around. + cmd: st unique seqhash input.fasta > output.fasta + other: + SeqKit: seqkit rmdup -sP [-w 0] input.fasta > output.fasta + compare_with: [SeqKit] + + seqhash_ignorecase: + description: > + Remove duplicate sequences using sequence hashes (case-insensitive). + cmd: st unique 'seqhash(true)' input.fasta > output.fasta + other: + VSEARCH: vsearch --derep_smallmem input.fasta --fastaout output.fasta [--fasta_width 0] + SeqKit: seqkit rmdup -sPi [-w 0] input.fasta > output.fasta + compare_with: [SeqKit] + compare: + vsearch_id_sorted_smallmem: + - st unique 'seqhash(true)' input.fasta | st sort id | st del -d > output.fasta + - vsearch --derep_smallmem input.fasta --fastaout - | st sort id > output.fasta + + seq: + description: > + Remove duplicate sequences that are exactly identical (case-insensitive); + comparing full sequences instead of not hashes (requires more memory). + VSEARCH additionally treats 'T' and 'U' in + the same way (seqtool doesn't). + cmd: st unique upper_seq input.fasta > output.fasta + other: + seqtool (sorted by sequence): st unique -s upper_seq input.fasta > output.fasta + VSEARCH: vsearch --derep_fulllength input.fasta --output output.fasta [--fasta_width 0] + compare: + st_mem: + - st unique -s seq input.fasta > output.fasta + - st unique -s -M 50M seq input.fasta > output.fasta + vsearch_id_sorted: + - st unique seq input.fasta | st sort id | st del -d > output.fasta + - vsearch --derep_fulllength input.fasta --output - | st sort id > output.fasta + + seq_smallmem: + description: Remove duplicate sequences (exact mode) with a memory limit of ~50 MiB + cmd: st unique seq -M 50M input.fasta > output.fasta + + seq_both: + description: Remove duplicate sequences, checking both strands + cmd: st unique seqhash_both input.fasta > output.fasta + other: + SeqKit: seqkit rmdup -s [-w 0] input.fasta > output.fasta + compare_with: [SeqKit] + + size_annot: + description: > + Remove duplicate sequences, appending USEARCH/VSEARCH-style abundance + annotations to the headers: *>id;size=NN* + cmd: st unique seq -a size={n_duplicates} --attr-fmt ';key=value' input.fasta > output.fasta + other: + VSEARCH: vsearch --derep_fulllength input.fasta --sizeout --output output.fasta [--fasta_width 0] + compare: + vsearch_id_sorted: + - st unique seq input.fasta -a size={n_duplicates} --attr-fmt ';key=value' | st del -d | st sort id > output.fasta + - vsearch --derep_fulllength input.fasta --output - --sizeout --fasta_width 0 | st sort id > output.fasta + + id_seq: + description: > + De-replicate both by sequence *and* record ID (the part before the first space in the header). + The given benchmark actually has unique sequence IDs, so the result is the same as de-replication + by sequence. + cmd: st unique id,seq input.fasta > output.fasta + other: + VSEARCH: vsearch --derep_id input.fasta --output output.fasta + compare: + vsearch_id_sorted: + - st unique id,seq input.fasta | st del -d | st sort id > output.fasta + - vsearch --derep_id input.fasta --output - --fasta_width 0 | st sort id > output.fasta + + +filter: + seqlen: + description: Filter sequences by length + cmd: st filter 'seqlen >= 100' input.fastq > output.fastq + other: + Seqtk: seqtk seq -L 100 input.fastq > output.fastq + SeqKit: seqkit seq -m 100 input.fastq > output.fastq + compare_with: [Seqtk, SeqKit] + quality: + description: > + Filter sequences by the total expected error as calculated from the + quality scores + cmd: st filter 'exp_err <= 1' input.fastq --to-fa > output.fastq + other: + VSEARCH: vsearch --fastq_filter input.fastq --fastq_maxee 1 --fastaout output.fasta [--fasta_width 0] + USEARCH: usearch -fastq_filter input.fastq -fastq_maxee 1 -fastaout output.fasta + compare_with: [VSEARCH] + select_ids: + description: Select records from a large set of sequences given a list of 1000 sequence IDs + prepare: st sample -n 1000 input.fasta --to-tsv id > ids_list.txt + cleanup: rm ids_list.txt + cmd: st filter -m ids_list.txt 'has_meta()' input.fasta > output.fasta + other: + VSEARCH: vsearch --fastx_getseqs input.fasta --labels ids_list.txt --fastaout output.fasta + SeqKit: seqkit grep -f ids_list.txt input.fasta > output.fasta + + +sample: + n: + description: Random subsampling to 1000 of sequences + cmd: st sample -n 1000 input.fasta > output.fasta + other: + VSEARCH: vsearch --fastx_subsample input.fasta --sample_size 1000 --fastaout output.fasta + Seqtk: seqtk sample input.fasta 1000 > output.fasta + SeqKit: seqkit sample -n 1000 input.fasta > output.fasta + proportion: + description: Random subsampling to ~10% of sequences + cmd: st sample -p 0.1 input.fasta > output.fasta + other: + Seqtk: seqtk sample input.fastq 0.1 > output.fasta + SeqKit: seqkit sample -p 0.1 input.fastq > output.fasta + + +find: + primer_location: + description: Find the forward primer location in the input reads with up to 4 mismatches + cmd: st find -D4 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq + other: + st (4 threads): st find -t4 -D4 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq + st (max. mismatches = 2): st find -D2 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq + st (max. mismatches = 8): st find -D8 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq + find_trim_primer: + description: > + Find and trim the forward primer up to an error rate (edit distance) of 20%, + discarding unmatched reads. + *Note:* Unlike Cutadapt, seqtool currently does not offer ungapped alignments + (`--no-indels`). + cmd: > + st find -f file:primers.fasta -R 0.2 input.fastq -a primer={pattern_name} -a end={match_end} | + st trim -e '{attr(end)}:' --fq > output.fastq + other: + Cutadapt: > + cutadapt -g 'file:primers.fasta;min_overlap=15' input.fastq -e 0.2 --rename '{id} primer={adapter_name}' --discard-untrimmed > output.fastq + find_trim_primer_j4: + description: > + Find and trim the forward primer in parallel using 4 threads (cores). + cmd: > + st find -f file:primers.fasta -R 0.2 -t4 input.fastq -a primer={pattern_name} -a end={match_end} | + st trim -e '{attr(end)}:' --fq > output.fastq + other: + Cutadapt: > + cutadapt -j4 -g 'file:primers.fasta;min_overlap=15' input.fastq -e 0.2 --rename '{id} primer={adapter_name}' --discard-untrimmed > output.fastq + + +replace: + dna_to_rna: + description: Convert DNA to RNA using the replace command + cmd: st replace T U input.fasta > output.fasta + other: + st find: st find T --rep U input.fasta > output.fasta + SeqKit: seqkit seq --dna2rna [-w 0] input.fasta > output.fasta + FASTX-Toolkit: fasta_nucleotide_changer -r -i input.fasta > output.fasta + compare_with: + ['st find', SeqKit] + dna_to_rna_4: + description: Convert DNA to RNA using 4 threads + cmd: st replace -t4 T U input.fasta > output.fasta + other: + st find: st find -t4 T --rep U input.fasta > output.fasta + + +trim: + trim: + description: Trim the leading 99 bp from the sequences + cmd: "st trim 100: input.fasta > output.fasta" + other: + SeqKit (creates FASTA index): seqkit subseq -r '100:-1' [-w 0] input.fasta > output.fasta + compare_with: [SeqKit (creates FASTA index)] + + +upper: + upper: + description: Convert sequences to uppercase + cmd: st upper input.fasta > output.fasta + other: + Seqtk: seqtk seq -U input.fasta > output.fasta + SeqKit: seqkit seq -u [-w 0] input.fasta > output.fasta + compare_with: [Seqtk, SeqKit] + + +revcomp: + revcomp: + description: Reverse complement sequences + cmd: st revcomp input.fasta > output.fasta + other: + Seqtk: seqtk seq -r input.fasta > output.fasta + VSEARCH: vsearch --fastx_revcomp input.fasta --fastaout output.fasta [--fasta_width 0] + SeqKit: seqkit seq -rp [-w 0] input.fasta > output.fasta + compare_with: [Seqtk, SeqKit, VSEARCH] + + +concat: + concat: + description: Concatenate sequences, adding an `NNNNN` spacer inbetween + prepare: + - ln -s input.fastq file1.fastq + - ln -s input.fastq file2.fastq + cleanup: rm file1.fastq file2.fastq + cmd: st concat -s 5 -c N file1.fastq file2.fastq > output.fastq + other: + VSEARCH: vsearch --fastq_join file1.fastq --reverse file2.fastq --join_padgap NNNNN --fastqout output.fastq diff --git a/profile/fastq_urls.txt b/profile/fastq_urls.txt new file mode 100644 index 0000000..8e7ef37 --- /dev/null +++ b/profile/fastq_urls.txt @@ -0,0 +1,15 @@ +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/079/ERR12573579/ERR12573579_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/096/ERR12551596/ERR12551596_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/003/ERR12551603/ERR12551603_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/005/ERR12551605/ERR12551605_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/009/ERR12551609/ERR12551609_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/091/ERR12551691/ERR12551691_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/094/ERR12551694/ERR12551694_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/095/ERR12551695/ERR12551695_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/097/ERR12551697/ERR12551697_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/099/ERR12551699/ERR12551699_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/002/ERR12551702/ERR12551702_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/004/ERR12551704/ERR12551704_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/007/ERR12551707/ERR12551707_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/011/ERR12551711/ERR12551711_1.fastq.gz +ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/063/ERR12551763/ERR12551763_1.fastq.gz diff --git a/scripts/compare_tools.py b/scripts/compare_tools.py new file mode 100644 index 0000000..9c9b62c --- /dev/null +++ b/scripts/compare_tools.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 + +import os + + +def run_benches(binary, input_dir, config, temp_dir=None, kind=None): + assert os.path.basename(binary) == 'st', "The binary name must be 'st'" + d = os.path.abspath(os.path.dirname(binary)) + os.environ['PATH'] = d + ":" + os.environ['PATH'] + # sys.path.insert(0, d) + if temp_dir is not None: + temp_dir = os.path.relpath(temp_dir, input_dir) + os.chdir(input_dir) + out = {} + for cmd, benches in config.items(): + cmd_out = out[cmd] = {} + for bench, cfg in benches.items(): + print("bench", bench) + cmd_out[bench] = run_bench(bench, cfg, temp_dir=temp_dir, kind=kind) + return out + +def run_bench(name, cfg, temp_dir=None, kind=None): + # print(cfg.get('description', name)) + # from pprint import pprint; pprint(cfg) + st_cmd = cfg['cmd'] + # do time measurements + out = {'description': cfg.get('description', name), 'other': {}} + if 'prepare' in cfg: + if isinstance(cfg['prepare'], str): + cfg['prepare'] = [cfg['prepare']] + for cmd in cfg['prepare']: + call(cmd, shell=True) + if kind is None or 'main' in kind: + out['st'], outfile = run_command(st_cmd, temp_dir=temp_dir) + if outfile is not None: + os.remove(outfile) + if kind is None or 'other' in kind: + for what, cmd in cfg.get('other', {}).items(): + # print(what, cmd, len(cmd)) + out['other'][what], outfile = run_command(cmd, temp_dir=temp_dir) + if outfile is not None: + os.remove(outfile) + # run comparisons + if kind is None or 'comparisons' in kind: + out1 = 'out1' + out2 = 'out2' + if 'compare_with' in cfg: + _, outfile = run_command(st_cmd, temp_dir=temp_dir) + os.rename(outfile, out1) + for what in cfg['compare_with']: + # print("compare", what) + cmd = cfg.get('other', {})[what] + _, outfile = run_command(cmd) + os.rename(outfile, out2) + assert_identical(out1, out2, st_cmd, cmd) + os.remove(out2) + os.remove(out1) + for what, cmds in cfg.get('compare', {}).items(): + # print("compare other", what) + assert len(cmds) == 2 + _, outfile = run_command(cmds[0], temp_dir=temp_dir) + os.rename(outfile, out1) + _, outfile = run_command(cmds[1], temp_dir=temp_dir) + os.rename(outfile, out2) + assert_identical(out1, out2, *cmds) + os.remove(out1) + os.remove(out2) + if 'cleanup' in cfg: + if isinstance(cfg['cleanup'], str): + cfg['cleanup'] = [cfg['cleanup']] + for cmd in cfg['cleanup']: + call(cmd, shell=True) + return out + +def assert_identical(f1, f2, cmd1, cmd2): + import subprocess + try: + subprocess.check_call(["diff", "-q", f1, f2]) + except subprocess.CalledProcessError: + import sys + print( + "COMPARISON ERROR: command output differs for:\n- {}\n- {}\nCheck with diff {} {}".format( + cmd1, cmd2, os.path.abspath(f1), + os.path.abspath(f2), os.path.abspath(f2)), + file=sys.stderr + ) + + +def run_command(command, temp_dir=None): + import re + from tempfile import mkstemp + # look for an [[alternative command]] to use instead (appended to command that is displayed) + mod_cmd = command + m = re.search(r"^(.+?)\[\[(.+?)\]\] *$", mod_cmd) + if m is not None: + command, mod_cmd = m.groups() + # look for individual args that should be used, but not be displayed by default + command = re.sub(r' +\[.+?\]', ' ', command) + mod_cmd = re.sub(r' +\[(.+?)\]', r' \1', mod_cmd) + print(command) + if command != mod_cmd: + print("[[ {} ]]".format(mod_cmd)) + _, time_out = mkstemp(prefix="st_time", dir=temp_dir) + # for running the command, we use Bash here + _cmd = ['/usr/bin/time', '-o', time_out, '-f', r'%e\t%M\t%P', 'bash', '-c', mod_cmd] + stdout, stderr = call(_cmd) + with open(time_out) as f: + # we just use the last line, since the first line will have an error message + # if the command fails + info = list(f.read().splitlines())[-1].split('\t') + os.remove(time_out) + result = { + 'cmd': command, + 'modified_cmd': None if mod_cmd == command else mod_cmd, + 'stdout': stdout.decode('utf-8'), + 'stderr': stderr.decode('utf-8'), + 'elapsed': float(info[0]), + # https://stackoverflow.com/questions/61392725/kilobytes-or-kibibytes-in-gnu-time + 'max_mib': float(info[1])/1024., + 'cpu': float(re.sub('%$', '', info[2])), + } + # Get output file name + m = re.search(r'\b(output(\.\w+)+)( |$)', mod_cmd) + outfile = None if m is None else m.group(1) + return result, outfile + + +def call(cmd, **kwarg): + import subprocess + # print("call", cmd) + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwarg) + out, err = p.communicate() + if p.returncode != 0: + import sys + print("ERROR: non-zero exit code {} for command:\n{}\nstderr:\n{}".format(p.returncode, cmd, err.decode('utf-8')), file=sys.stderr) + return out, err + + +def generate_files(fastq_path, outdir): + if not os.path.exists(outdir): + os.makedirs(outdir) + fq = os.path.join(outdir, 'input.fastq') + if os.path.exists(fq): + os.remove(fq) + os.symlink(os.path.abspath(fastq_path), os.path.abspath(fq)) + fa = os.path.splitext(fq)[0] + '.fasta' + if not os.path.exists(fa): + call(['st', '.', '--to-fa', fq, '-o', fa]) + + +if __name__ == '__main__': + import argparse + import yaml + import json + + parser = argparse.ArgumentParser() + parser.add_argument('fastq') + parser.add_argument('configfile') + parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-') + parser.add_argument('-s', '--selection', + help='comma-separated list of benchmarks to run') + parser.add_argument('-b', '--binary', default='target/release/st') + parser.add_argument('-d', '--input-dir', default='target/st_benchmark') + parser.add_argument('-t', '--temp-dir') + parser.add_argument('-k', '--kind', default='main,other,comparisons', + help="Comma-separated list of benchmark types to run. " + "Possible are 'main' (the seqtool commands), 'other' (other tools) " + "and 'comparisons' (comparison of output from different tools). " + "Default is to run all.") + args = parser.parse_args() + + generate_files(args.fastq, args.input_dir) + + with open(args.configfile) as f: + config = yaml.safe_load(f) + + if args.selection: + sel = [s.strip() for s in args.selection.split(',')] + config = {k: v for k, v in config.items() if k in sel} + + # print(config) + kind = [k.strip() for k in args.kind.split(',')] + out = run_benches(args.binary, args.input_dir, config, temp_dir=args.temp_dir, kind=kind) + + json.dump(out, args.output, indent=2) diff --git a/scripts/summarize_comparison.py b/scripts/summarize_comparison.py new file mode 100644 index 0000000..72976ad --- /dev/null +++ b/scripts/summarize_comparison.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +import json +import re + + +def gen_summary(json_input, md_out): + from html import escape + + def fmt_bench(d): + return '{}{}
{}'.format( + format_time(d), + ' {:.0f}% CPU'.format(d['cpu']) if abs(100 - d['cpu']) > 5 else '', + format_mem(d), + ) + + def find_best(d): + runs = [d['st']] + list(d.get('other', {}).values()) + if len(runs) > 1: + times = sorted((r['elapsed'], i) for i, r in enumerate(runs)) + if times[0][0] > 0: + runs[times[0][1]]['fastest'] = times[1][0] / times[0][0] + mem = sorted((r['max_mib'], i) for i, r in enumerate(runs)) + if mem[0][0] > 0: + runs[mem[0][1]]['lowest_mem'] = mem[1][0] / mem[0][0] + + def format_time(d): + f = d.get('fastest') + if f is not None: + return '🕓 {:.1f} s 🏆 ({:.1f}x)'.format(d['elapsed'], f) + return '🕓 {:.1f} s'.format(d['elapsed']) + + def format_mem(d): + f = d.get('lowest_mem') + if f is not None: + return '📈 {:.1f} MiB 🏆 ({:.2f}x)'.format(d['max_mib'], f) + return '📈 {:.1f} MiB'.format(d['max_mib']) + + def fmt_output(d): + strip_newlines = lambda msg: re.sub(r'(?ms:[\r\n\s]+$)', '', msg) + out = '' + if d['stdout']: + # TODO: get this to work: https://github.com/squidfunk/mkdocs-material/issues/4964 + out += '
🟦 output\n\n```\n{}\n```\n\n
\n'.format(strip_newlines(d['stdout'])) + if d['stderr']: + out += '
 messages\n\n```\n{}\n```\n\n
\n'.format(strip_newlines(d['stderr'])) + return out + + for command, comparisons in json.load(json_input).items(): + md_out.write('## {}\n'.format(command)) + md_out.write('\n\n') + for comparison, d in comparisons.items(): + find_best(d) + st = d['st'] + md_out.write('\n\n\n'.format(escape(d.get('description', comparison)))) + md_out.write('\n\n\n\n\n\n'.format(fmt_bench(st))) + md_out.write('
\n\n{}\n\n\n\n```bash\n{}\n```\n\n{}\n'.format(st['cmd'], fmt_output(st))) + if 'other' in d and d['other']: + md_out.write('
{}\n\n\n'.format( + "  ❙ ".join('{} {}'.format( + escape(tool), format_time(o), + ) + for tool, o in d['other'].items()) + ) + ) + for tool, o in d['other'].items(): + code = ''.format( + o['cmd'].replace('\n', ' '), + fmt_output(o) + ) + md_out.write('\n\n\n\n{}\n\n\n\n\n\n'.format( + escape(tool), + code, + fmt_bench(o) + )) + md_out.write('
\n\n```bash\n{}\n```\n\n{}
{}{}
\n\n
\n\n') + md_out.write('
{}
\n\n') + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('json_input', type=argparse.FileType('r')) + parser.add_argument('md_out', type=argparse.FileType('w')) + args = parser.parse_args() + + gen_summary(**vars(args)) diff --git a/src/cli.rs b/src/cli.rs index 83315e7..aa2e5de 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -332,10 +332,9 @@ pub enum SubCommand { /// Sort records by sequence or any other criterion #[cfg(any(feature = "all-commands", feature = "sort"))] Sort(cmd::sort::cli::SortCommand), - /// De-replicate records by sequence or any other criterion, returning only + /// De-replicate by sequence and/or other properties, returning only /// unique records #[cfg(any(feature = "all-commands", feature = "unique"))] - #[clap(about, long_about)] Unique(cmd::unique::cli::UniqueCommand), /// Keep/exclude sequences based on different properties with a /// mathematical (JavaScript) expression diff --git a/src/cmd/count.rs b/src/cmd/count.rs index d2be0b4..5a6ef9a 100755 --- a/src/cmd/count.rs +++ b/src/cmd/count.rs @@ -11,12 +11,9 @@ use crate::helpers::{value::SimpleValue, DefaultHashMap as HashMap}; use crate::var::varstring::VarString; pub const DESC: &str = " -Records in all the provided input (including multiple files) are counted -collectively. - -In addition, records can be summarized over one or more categories -specified with `-k/--key`. The reported counts are always sorted by the -numeric or text category. +The overall record count is returned for all input files collectively. +Optionally, grouping categories (text or numeric) can be specified using +`-k/--key`. The tab-delimited output is sorted by the categories. "; lazy_static::lazy_static! { @@ -49,9 +46,9 @@ pub struct CountCommand { /// Maximum number of categories to count before aborting with an error. /// This limit is a safety measure to prevent memory exhaustion. - /// Usually, a very large number of categories is not intended and may - /// happen if continuous numbers are not categorized with the - /// `bin(, )` function. + /// A very large number of categories could unintentionally occur with a + /// condinuous numeric key (e.g. `gc_percent`). These can be grouped into + /// regular intervals using `bin(, )`. #[arg(short = 'l', long, default_value_t = 1000000)] category_limit: usize, diff --git a/src/cmd/find/vars.rs b/src/cmd/find/vars.rs index 8aea1a2..2f40e15 100755 --- a/src/cmd/find/vars.rs +++ b/src/cmd/find/vars.rs @@ -22,7 +22,7 @@ variable_enum! { /// /// # Examples /// - /// Find a primer sequence with up to 2 mismatches (`-d/--dist``) and write + /// Find a primer sequence with up to 2 mismatches (`-d/--dist`) and write /// the match range and the mismatches ('dist') to the header as attributes. /// The result will be 'undefined' (=undefined in JavaScript) if there are > 2 mismatches /// @@ -32,7 +32,7 @@ variable_enum! { /// SEQUENCE /// >id2 rng=1:20 dist=0 /// SEQUENCE - /// >id3 rng= dist= + /// >id3 rng=undefined dist=undefined /// SEQUENCE /// (...) /// @@ -41,7 +41,7 @@ variable_enum! { /// while non-matching sequences are written to 'no_primer.fasta' /// /// `st find -f -d 2 CTTGGTCATTTAGAGGAAGTAA --dropped no_primer.fasta -a end={match_end} reads.fasta | - /// st trim -e '{attr(match_end)}..' > primer_trimmed.fasta` + /// st trim -e '{attr(match_end)}:' > primer_trimmed.fasta` /// /// /// Search for several primers with up to 2 mismatches and write the name and mismatches @@ -53,7 +53,7 @@ variable_enum! { /// SEQUENCE /// >id1 primer=primer_2 dist=0 /// SEQUENCE - /// >id1 primer= dist= + /// >id1 primer=undefined dist=undefined /// SEQUENCE /// (...) FindVar { @@ -145,7 +145,7 @@ variable_enum! { /// to the pattern MatchSubst(Number) { hit: String = String::from("1"), pattern: usize = 1 }, /// Name of the matching pattern (patterns supplied with `file:patterns.fasta`). - /// In case a single pattern was specified in the commandline, this will just be ``. + /// In case a single pattern was specified in the commandline, this will just be **. /// `pattern_name(rank)` selects the n-th matching pattern, sorted by edit distance /// and/or pattern number (depending on `-D/-R` and `--in-order`). PatternName(Text) { rank: usize = 1 }, diff --git a/src/cmd/mask.rs b/src/cmd/mask.rs index cca7cf9..8057064 100755 --- a/src/cmd/mask.rs +++ b/src/cmd/mask.rs @@ -1,13 +1,19 @@ use clap::Parser; -use crate::cli::CommonArgs; +use crate::cli::{CommonArgs, WORDY_HELP}; use crate::config::Config; use crate::error::CliResult; use crate::helpers::var_range::VarRanges; use crate::io::SeqQualRecord; +pub const DESC: &str = "\ +Masks the sequence within a given range or comma delimited list of ranges +by converting to lowercase (soft mask) or replacing with a character (hard +masking). Reverting soft masking is also possible."; + #[derive(Parser, Clone, Debug)] #[clap(next_help_heading = "'Mask' command options")] +#[clap(before_help=DESC, help_template=WORDY_HELP)] pub struct MaskCommand { /// Range in the form 'start:end' or 'start:' or ':end', /// The range start/end may be defined by varialbes/functions, diff --git a/src/cmd/trim.rs b/src/cmd/trim.rs index ee47799..d0f6ee8 100755 --- a/src/cmd/trim.rs +++ b/src/cmd/trim.rs @@ -1,19 +1,13 @@ use clap::Parser; -use crate::cli::{CommonArgs, WORDY_HELP}; +use crate::cli::CommonArgs; use crate::config::Config; use crate::error::CliResult; use crate::helpers::{rng::Range, var_range::VarRanges}; use crate::io::{Record, SeqQualRecord}; -pub const DESC: &str = "\ -Masks the sequence within a given range or comma delimited list of ranges -by converting to lowercase (soft mask) or replacing with a character (hard -masking). Reverting soft masking is also possible."; - #[derive(Parser, Clone, Debug)] #[clap(next_help_heading = "'Trim' command options")] -#[clap(before_help=DESC, help_template=WORDY_HELP)] pub struct TrimCommand { /// Range(s) in the form 'start:end' or 'start:' or ':end', /// Multiple ranges can be supplied as comma-delimited list: diff --git a/var_provider/src/var_provider.rs b/var_provider/src/var_provider.rs index e040344..e0a6a78 100644 --- a/var_provider/src/var_provider.rs +++ b/var_provider/src/var_provider.rs @@ -139,14 +139,14 @@ pub trait VarProviderInfo: fmt::Debug { // .into_iter() // .map(|u| format!("{}`", u)) .join("
"); - // TODO: very simple but fine for a help page + // TODO: very simple but... but probably fine for a help page let desc = info .description .replace('<', r"\<") .replace('>', r"\>") .replace('\n', "
"); - writeln!(out, "| {} | {} |", usages, desc)?; + writeln!(out, "| {} | {} |", info.name, usages, desc)?; } } writeln!(out)?; @@ -161,7 +161,7 @@ pub trait VarProviderInfo: fmt::Debug { writeln!(out, "### {}", ex)?; for example in examples { writeln!(out, "{}:", example.description)?; - writeln!(out, "```sh\n{}\n```", example.command)?; + writeln!(out, "```bash\n{}\n```", example.command)?; if let Some(output) = example.output { writeln!(out, "```\n{}\n```", output)?; } diff --git a/wix/main.wxs b/wix/main.wxs index f2c4209..8aa5ab0 100644 --- a/wix/main.wxs +++ b/wix/main.wxs @@ -69,7 +69,7 @@