From 35ef42580f44b97039718f356a7fa52b1ec84d61 Mon Sep 17 00:00:00 2001
From: markschl <markschl19@gmail.com>
Date: Mon, 16 Sep 2024 15:12:48 +0200
Subject: [PATCH] Add comparison of tools & adjust docs

* Added some scripts for profiling seqtool
  and other tools
* Adjusted some docs (see also seqtool-docs repo)
---
 Cargo.toml                       |   2 +-
 README.md                        | 151 ++++++++-------
 profile/README.md                |  79 ++++++++
 profile/comparison_commands.yaml | 305 +++++++++++++++++++++++++++++++
 profile/fastq_urls.txt           |  15 ++
 scripts/compare_tools.py         | 185 +++++++++++++++++++
 scripts/summarize_comparison.py  |  87 +++++++++
 src/cli.rs                       |   3 +-
 src/cmd/count.rs                 |  15 +-
 src/cmd/find/vars.rs             |  10 +-
 src/cmd/mask.rs                  |   8 +-
 src/cmd/trim.rs                  |   8 +-
 var_provider/src/var_provider.rs |   6 +-
 wix/main.wxs                     |   2 +-
 14 files changed, 779 insertions(+), 97 deletions(-)
 create mode 100644 profile/README.md
 create mode 100644 profile/comparison_commands.yaml
 create mode 100644 profile/fastq_urls.txt
 create mode 100644 scripts/compare_tools.py
 create mode 100644 scripts/summarize_comparison.py

diff --git a/Cargo.toml b/Cargo.toml
index e4da5a1..7572f3c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ name = "seqtool"
 version = "0.4.0-beta.2"
 edition = "2021"
 authors = ["Markus Schlegel <markschl19@gmail.com>"]
-description = "General purpose tool for reading, modifying and writing biological sequences."
+description = "General-purpose tool for reading, modifying and writing biological sequences."
 license = "MIT"
 repository = "https://github.com/markschl/seqtool"
 readme = "README.md"
diff --git a/README.md b/README.md
index 0132d8f..dfbfabe 100755
--- a/README.md
+++ b/README.md
@@ -1,55 +1,61 @@
 **Seqtool** is a  fast and flexible command line program for dealing with
 large amounts of biological sequences.
-It provides different subcommands for *converting*, *inspecting* and *modifying*
-sequences.
+It provides different [subcommands](#commands) for *converting*, *inspecting*
+and *modifying* sequences.
 The standalone binary (~6 MB) is simply named `st` to save some typing.
 
+[![CI](https://github.com/markschl/seqtool/actions/workflows/ci.yaml/badge.svg)](https://github.com/markschl/seqtool/actions/workflows/ci.yaml)
+
 > **Note:** this page describes the development version 0.4-beta.
 > **The older stable version (v0.3.0) is [documented here](https://github.com/markschl/seqtool/wiki).**
 
-> ⚠ Also note that **there are some bugs in v0.3.0**,
-> see [CHANGELOG](https://github.com/markschl/seqtool/blob/main/CHANGELOG.md#important-bugfixes-).
-> Alternatively, v0.4.0-beta should be pretty safe to use already.
 
-**[📥 download stable release (v0.3.0)](https://github.com/markschl/seqtool/releases/latest)**
+## Detailed documentation
 
-**[📥 download beta release (v0.4.0-beta)](https://github.com/markschl/seqtool/releases/tag/0.4.0-beta.2)**
+**See [this page](https://markschl.github.io/seqtool-docs)**
 
-[![CI](https://github.com/markschl/seqtool/actions/workflows/ci.yaml/badge.svg)](https://github.com/markschl/seqtool/actions/workflows/ci.yaml)
 
+## Downloads
 
+**[📥 download stable release (v0.3.0)](https://github.com/markschl/seqtool/releases/latest)**
 
-## Detailed documentation
+> **⚠ Note**: there are a few **unfixed bugs in v0.3.0** (currently)
+> when reading GZIP files or searching/replacing;
+> see [CHANGELOG for v0.4.0-beta](https://github.com/markschl/seqtool/blob/main/CHANGELOG.md#important-bugfixes-).
+> Alternatively, consider using v0.4.0-beta.
 
-**See [this page](https://markschl.github.io/seqtool-docs)**
+**[📥 download beta release (v0.4.0-beta)](https://github.com/markschl/seqtool/releases/tag/0.4.0-beta.2)**
+
+> Should be pretty safe to use despite considerable refactoring.
+> Approximate matching (https://markschl.github.io/seqtool-docs/[find](find) command) is yet to be fully tested.
 
 
 ## Feature overview
 
 ### File formats
 
-[**Reads** and **writes**](https://markschl.github.io/seqtool-docs/pass) **FASTA, FASTQ** and **CSV/TSV**, optionally compressed
+[**Reads** and **writes**](https://markschl.github.io/seqtool-docs/formats) **FASTA, FASTQ** and **CSV/TSV**, optionally compressed
 with [GZIP](https://en.wikipedia.org/wiki/Gzip), [BZIP2](https://en.wikipedia.org/wiki/Bzip2),
 or the faster and more modern [Zstandard](http://facebook.github.io/zstd/) or [LZ4](https://lz4.org/)
 formats
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: compressed FASTQ to FASTA
 </summary>
 
 Combine multiple compressed FASTQ files, converting them to FASTA, using [pass](https://markschl.github.io/seqtool-docs/pass).
 
-> **Note**: almost every command can read multiple input files and convert between formats,
-> but *pass* does nothing other than reading and writing while other command perform certain actions.
-
-```sh
+```bash
 st pass file1.fastq.gz file2.fastq.gz -o output.fasta
 ```
 
+> **Note**: almost every command can read multiple input files and convert between formats,
+> but *pass* does nothing other than reading and writing while other command perform certain actions.
+
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: FASTA to tab-separated list
 </summary>
@@ -57,7 +63,7 @@ Example: FASTA to tab-separated list
 Aside from ID and sequence, any [variable/function](https://markschl.github.io/seqtool-docs/variables) such as
 the sequence length (`seqlen`) can be written to  delimited text.
 
-```sh
+```bash
 st pass input.fasta --to-tsv id,seq,seqlen
 ``` 
 
@@ -70,23 +76,16 @@ id1	ACGTA	5
 </details>
 
 
-### Commands for many different tasks
-
-([see list below](#commands))
-
-### Highly versatile
+### Highly versatile thanks to variables/functions
 
-... thanks to **[variables/functions](https://markschl.github.io/seqtool-docs/variables)**
+See also **[variables/functions](https://markschl.github.io/seqtool-docs/variables)** for more details.
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: count sequences in a large set of FASTQ files
 </summary>
 
-In [count](https://markschl.github.io/seqtool-docs/count), one or several categorical [variables/functions](https://markschl.github.io/seqtool-docs/variables)
-can be specified with `-k/--key`.
-
-```sh
+```bash
 st count -k path data/*.fastq.gz
 ```
 
@@ -99,9 +98,12 @@ data/sample5.fastq.gz	7021
 (...)
 ```
 
+> In [count](https://markschl.github.io/seqtool-docs/count), one or several categorical [variables/functions](https://markschl.github.io/seqtool-docs/variables)
+> can be specified with `-k/--key`.
+
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: summarize the GC content in 10% intervals
 </summary>
@@ -109,7 +111,7 @@ Example: summarize the GC content in 10% intervals
 The function `bin(variable, interval)` groups continuous numeric values
 into intervals
 
-```sh
+```bash
 st count -k 'bin(gc_percent, 10)' sequences.fasta
 ```
 
@@ -123,12 +125,12 @@ st count -k 'bin(gc_percent, 10)' sequences.fasta
 
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: Assign new sequence IDs
 </summary>
 
-```sh
+```bash
 st set -i 'seq_{num}' seqs.fasta > renamed.fasta
 ```
 
@@ -144,7 +146,7 @@ SEQUENCE
 
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: De-replicate by description and sequence
 </summary>
@@ -164,7 +166,7 @@ SEQUENCE1
 SEQUENCE1
 ```
 
-```sh
+```bash
 st unique 'desc,seq' seqs.fasta > grouped_uniques.fasta
 ```
 
@@ -185,7 +187,7 @@ From simple math to complicated filter [expressions](https://markschl.github.io/
 ([QuickJS](https://bellard.org/quickjs)) offers countless possibilities for customized
 sequence processing.
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: filter FASTQ sequences by quality and length
 </summary>
@@ -194,19 +196,19 @@ This [filter](https://markschl.github.io/seqtool-docs/filter) command removes se
 sequencing error (like [USEARCH](https://www.drive5.com/usearch/manual/exp_errs.html) can do)
 or sequence length of <100 bp.
 
-```sh
+```bash
 st filter 'exp_err < 1 && seqlen >= 100' reads.fastq > filtered.fastq
 ```
 
 </details>
 
 
-### Header attributes
+### Header attributes for metadata storage
 
 **`key=value` [header attributes](https://markschl.github.io/seqtool-docs/attributes)** allow storing and passing on
 all kinds of information
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: De-replicate by sequence (seq variable) and/or other properties  
 </summary>
@@ -214,7 +216,7 @@ Example: De-replicate by sequence (seq variable) and/or other properties
 The [unique](https://markschl.github.io/seqtool-docs/unique) command returns all unique sequences and annotates
 the number of records with the same sequence in the header:
 
-```sh
+```bash
 st unique seq -a abund={n_duplicates} input.fasta > uniques.fasta
 ```
 
@@ -229,7 +231,7 @@ GGAGGATCCGAGCG
 It is also possible to de-replicate by multiple keys, e.g. by sequence,
 but grouped by a `sample` attribute in the header:
 
-```sh
+```bash
 st unique 'seq,attr(sample)' input.fasta > uniques.fasta
 ```
 
@@ -247,7 +249,7 @@ SEQUENCE4
 
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: pre-processing of mixed multi-marker amplicon sequences (primer trimming, grouping by amplicon)
 </summary>
@@ -258,7 +260,7 @@ multi-marker amplicons.
 and finally [split](https://markschl.github.io/seqtool-docs/split) distributes the sequences into different files named
 by the forward primer.
 
-`primers.fasta`
+**primers.fasta**
 
 ```
 >prA
@@ -267,16 +269,18 @@ PRIMER
 PRIMER
 ```
 
-```sh
+**Command for searching/trimming**
+
+```bash
 st find file:primers.fasta -a primer='{pattern_name}' -a end='{match_end}' sequences.fasta |
   st trim -e '{attr(end)}..' | 
   st split -o '{attr(primer)}'
 ```
 
-<table>
+<table markdown>
 <tr><th>prA.fasta </th><th>prB.fasta</th><th>undefined.fasta</th></tr>
-<tr>
-<td>
+<tr markdown>
+<td markdown>
 
 ```
 >id1 primer=prA end=22
@@ -287,7 +291,7 @@ SEQUENCE
 ```
 
 </td>
-<td>
+<td markdown>
 
 ```
 >id2 primer=prB end=20
@@ -298,7 +302,7 @@ SEQUENCE
 ```
 
 </td>
-<td>
+<td markdown>
 
 ```
 >id5 primer=undefined end=undefined
@@ -315,19 +319,19 @@ UNTRIMMEDSEQUENCE
 </details>
 
 
-### Metadata integration
+### Integration of external metadata
 
 Integration of [**sequence metadata sources**](https://markschl.github.io/seqtool-docs/meta) in the form of delimited text
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: Add Genus names from a separate tab-separated list
 </summary>
 
-<table>
+<table markdown>
 <tr><th>input.fasta</th><th>genus.tsv</th></tr>
-<tr>
-<td>
+<tr markdown>
+<td markdown>
 
 ```
 >id1
@@ -338,7 +342,7 @@ SEQUENCE
 ```
 
 </td>
-<td>
+<td markdown>
 
 ```
 id  genus
@@ -353,14 +357,14 @@ seq2  Amycolatopsis
 
 Using `-m/--meta` to include `genus.tsv` as metadata source:
 
-```sh
+```bash
 st set -m genus.tsv --desc '{meta(genus)}' input.fasta > with_genus.fasta
 ```
 
-<table>
+<table markdown>
 <tr><th>with_genus.fasta</th></tr>
-<tr>
-<td>
+<tr markdown>
+<td markdown>
 
 ```
 >seq1 Actinomyces
@@ -375,15 +379,15 @@ SEQUENCE
 </table>
 </details>
 
-<details markdown>
+<details markdown class="highlight">
 <summary>
 Example: Choose specific sequences given a separate file with an ID list
 </summary>
 
-<table>
+<table markdown>
 <tr><th>input.fasta</th><th>id_list.txt</th></tr>
-<tr>
-<td>
+<tr markdown>
+<td markdown>
 
 ```
 >id1
@@ -409,14 +413,14 @@ id4
 </table>
 
 
-```sh
+```bash
 st filter -m id_list.txt 'has_meta()' input.fasta > subset.fasta
 ```
 
-<table>
+<table markdown>
 <tr><th>subset.fasta</th></tr>
-<tr>
-<td>
+<tr markdown>
+<td markdown>
 
 ```
 >id1
@@ -442,7 +446,7 @@ attribute setting
 
 ### Subset/shuffle
 * **[sort](https://markschl.github.io/seqtool-docs/sort)**: Sort records by sequence or any other criterion
-* **[unique](https://markschl.github.io/seqtool-docs/unique)**: General purpose tool for reading, modifying and writing biological sequences.
+* **[unique](https://markschl.github.io/seqtool-docs/unique)**: De-replicate by sequence and/or other properties, returning only unique records
 * **[filter](https://markschl.github.io/seqtool-docs/filter)**: Keep/exclude sequences based on different properties with a mathematical
 (JavaScript) expression
 * **[split](https://markschl.github.io/seqtool-docs/split)**: Distribute sequences into multiple files based on a variable/function or
@@ -470,3 +474,14 @@ concatenate several ranges
 * **[revcomp](https://markschl.github.io/seqtool-docs/revcomp)**: Reverse complements DNA or RNA sequences
 * **[concat](https://markschl.github.io/seqtool-docs/concat)**: Concatenates sequences/alignments from different files
 
+## Comparison with other tools
+
+There are other tools with a similar focus such as [Seqtk](https://github.com/lh3/seqtk),
+[SeqKit](https://github.com/shenwei356/seqkit), the [FASTX-Toolkit](https://github.com/agordon/fastx_toolkit),
+as well as the more specialized [USEARCH](https://www.drive5.com/usearch) and
+[VSEARCH](https://github.com/torognes/vsearch) offering some of the functions
+as well.
+
+*Seqtool* performs well compared to these tools on a selection of diverse tasks:
+
+**[Comparison of tools](https://markschl.github.io/seqtool-docs/comparison)**
diff --git a/profile/README.md b/profile/README.md
new file mode 100644
index 0000000..d121299
--- /dev/null
+++ b/profile/README.md
@@ -0,0 +1,79 @@
+# Measuring time and memory / comparison to other tools
+
+The following should work for Ubuntu Linux.
+
+```bash
+outdir=target/st_benchmark
+fq=$outdir/reads.fq
+mkdir -p $outdir
+```
+
+## Build the binary
+
+```bash
+cargo build --release
+st=target/release/st
+```
+
+## Download sequencing reads
+
+
+```bash
+wget -qi profile/fastq_urls.txt -O - | zcat > $fq
+ls -lh $fq
+```
+
+## Create temporary storage
+
+We rely on *tmpfs* to store output (and some input) files in memory,
+avoiding disk IO latency as much as possible.
+
+```bash
+rm -Rf $outdir/workdir && mkdir $outdir/workdir
+chmod 777 $outdir/workdir
+sudo mount -t tmpfs -o size=10G none $outdir/workdir
+mkdir -p $outdir/workdir/tmp
+```
+
+Prepare forward primer for searching
+
+```bash
+cat > $outdir/workdir/primers.fasta <<- EOM
+>ITS4
+GTCCTCCGCTTATTGATATGC
+EOM
+```
+
+## Run the benchmarks
+
+Before running, disable frequency boost:
+
+```bash
+echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
+```
+
+On Ubuntu, disable the indexer for full-text search
+
+```bash
+echo -n > $outdir/workdir/.trackerignore
+```
+
+Run the comparison. The `compare_tools.py` does not only compare runtimes / memory usage,
+but in some cases also validates that the output is the same.
+See `comparison_commands.yaml`.
+
+```bash
+export SEQKIT_THREADS=1
+$st count $fq  # cache the file in memory
+scripts/compare_tools.py \
+    -b $st -d $outdir/workdir -o profile/comparison.json -t $outdir/workdir/tmp \
+    $fq profile/comparison_commands.yaml 
+
+scripts/summarize_comparison.py profile/comparison.json - > profile/comparison.md
+```
+
+## Clean up
+
+```bash
+rm -Rf $outdir/workdir
+```
diff --git a/profile/comparison_commands.yaml b/profile/comparison_commands.yaml
new file mode 100644
index 0000000..2baa426
--- /dev/null
+++ b/profile/comparison_commands.yaml
@@ -0,0 +1,305 @@
+pass:
+  no_action:
+    description: Do nothing, just read and write FASTA
+    cmd: st pass input.fasta > output.fasta
+    other:
+      SeqKit: seqkit seq [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit]
+  fasta:
+    description: Convert FASTQ to FASTA
+    cmd: st pass --to-fa input.fastq > output.fasta
+    other:
+      FASTX-Toolkit: fastq_to_fasta -Q33 -i input.fastq > output.fasta
+      Seqtk: seqtk seq -A input.fastq > output.fasta
+      SeqKit: seqkit fq2fa input.fastq > output.fasta
+    compare_with: [SeqKit]
+  qual_scores:
+    description: Convert FASTQ quality scores
+    cmd: st pass --to fastq-illumina input.fastq > output.fastq
+    other:
+      VSEARCH: vsearch --fastq_convert input.fastq --fastq_asciiout 64 --fastqout output.fastq
+      SeqKit: seqkit convert --from 'Sanger' --to 'Illumina-1.3+' input.fastq > output.fastq
+    compare_with: [SeqKit]
+  gzip_compress:
+    description: Write compressed FASTQ files in GZIP format
+    cmd: st pass input.fastq -o output.fastq.gz
+    other:
+      SeqKit: seqkit seq input.fastq -o output.fastq.gz
+      seqtool | gzip: st pass input.fastq | gzip -c > output.fastq.gz
+      gzip directly: gzip -kf input.fastq
+      pigz directly (4 threads): pigz -p4 -kf input.fastq
+  compress_zstd:
+    description: Write compressed FASTQ files in Zstandard format
+    cmd: st pass input.fastq -o output.fastq.zst
+    other:
+      seqtool | zstd piped: st pass input.fastq | zstd -c > output.fastq.zst
+  compress_lz4:
+    description: Write compressed FASTQ files in Lz4 format
+    cmd: st pass input.fastq -o output.fastq.lz4
+    other:
+      seqtool | lz4 piped: st pass input.fastq | lz4 -c > output.fastq.lz4
+
+
+count:
+  all:
+    description: Count the number of FASTQ sequences in the input
+    cmd: st count input.fastq
+    other:
+      Seqtk: seqtk size input.fasta
+    compare:
+      seqtk:
+        - st count input.fastq > output.tsv
+        - seqtk size input.fastq | cut -f1 > output.tsv
+  gc:
+    description: Count the number of FASTQ sequences, grouped by GC content (in 10% intervals)
+    cmd: st count -k 'bin(gc_percent, 10)' input.fastq
+    other:
+      st with math expression: st count -k '{bin(gc_percent/100*100, 10)}' input.fastq
+
+
+sort:
+  sequence:
+    description: Sort by sequence
+    cmd: st sort seq input.fasta > output.fasta
+    other:
+      SeqKit: seqkit sort -s [-w 0] input.fasta > output.fasta
+  sequence_lim:
+    description: Sort by sequence with ~ 50 MiB memory limit
+    cmd: st sort seq input.fasta -M 50M > output.fasta
+    other:
+      100 MiB memory limit: st sort seq input.fasta -M 100M > output.fasta
+
+  id:
+    description: Sort by record ID
+    cmd: st sort id input.fasta > output.fasta
+    other:
+      SeqKit: seqkit sort [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit]
+  length:
+    description: Sort by sequence length
+    cmd: st sort seqlen input.fasta > output.fasta
+    other:
+      SeqKit: seqkit sort -l [-w 0] input.fasta > output.fasta
+      VSEARCH: vsearch --sortbylength input.fasta --output output.fasta [--fasta_width 0]
+  size:
+    description: Sort sequences by USEARCH/VSEARCH-style abundance annotations
+    cmd: >
+      ST_ATTR_FMT=';key=value' st unique seq -a size={n_duplicates} input.fasta |
+        st sort '{-attr("size")}' > output.fasta
+    other:
+      VSEARCH: >
+        vsearch --derep_fulllength input.fasta --output - --sizeout |
+          vsearch --sortbysize - --output output.fasta [--fasta_width 0]
+    compare_with: []
+
+
+unique:
+  seqhash:
+    description: >
+      Remove duplicate sequences using sequence hashes.
+      This is more memory efficient and usually faster than keeping the whole 
+      sequence around.
+    cmd: st unique seqhash input.fasta > output.fasta
+    other:
+      SeqKit: seqkit rmdup -sP [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit]
+  
+  seqhash_ignorecase:
+    description: >
+      Remove duplicate sequences using sequence hashes (case-insensitive).
+    cmd: st unique 'seqhash(true)' input.fasta > output.fasta
+    other:
+      VSEARCH: vsearch --derep_smallmem input.fasta --fastaout output.fasta [--fasta_width 0]
+      SeqKit: seqkit rmdup -sPi [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit]
+    compare:
+        vsearch_id_sorted_smallmem:
+        - st unique 'seqhash(true)' input.fasta | st sort id | st del -d > output.fasta
+        - vsearch --derep_smallmem input.fasta --fastaout - | st sort id > output.fasta
+
+  seq:
+    description: >
+      Remove duplicate sequences that are exactly identical (case-insensitive);
+      comparing full sequences instead of not hashes (requires more memory).
+      VSEARCH additionally treats 'T' and 'U' in
+      the same way (seqtool doesn't).
+    cmd: st unique upper_seq input.fasta > output.fasta
+    other:
+      seqtool (sorted by sequence): st unique -s upper_seq input.fasta > output.fasta
+      VSEARCH: vsearch --derep_fulllength input.fasta --output output.fasta [--fasta_width 0]
+    compare:
+      st_mem:
+        - st unique -s seq input.fasta > output.fasta
+        - st unique -s -M 50M seq input.fasta > output.fasta
+      vsearch_id_sorted:
+        - st unique seq input.fasta | st sort id | st del -d > output.fasta
+        - vsearch --derep_fulllength input.fasta --output - | st sort id > output.fasta
+  
+  seq_smallmem:
+    description: Remove duplicate sequences (exact mode) with a memory limit of ~50 MiB
+    cmd: st unique seq -M 50M input.fasta > output.fasta
+
+  seq_both:
+    description: Remove duplicate sequences, checking both strands
+    cmd: st unique seqhash_both input.fasta > output.fasta
+    other:
+      SeqKit: seqkit rmdup -s [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit]
+
+  size_annot:
+    description: >
+      Remove duplicate sequences, appending USEARCH/VSEARCH-style abundance
+      annotations to the headers: *>id;size=NN*
+    cmd: st unique seq -a size={n_duplicates} --attr-fmt ';key=value' input.fasta > output.fasta
+    other:
+      VSEARCH: vsearch --derep_fulllength input.fasta --sizeout --output output.fasta [--fasta_width 0]
+    compare:
+      vsearch_id_sorted:
+        - st unique seq input.fasta -a size={n_duplicates} --attr-fmt ';key=value' | st del -d | st sort id > output.fasta
+        - vsearch --derep_fulllength input.fasta --output - --sizeout --fasta_width 0 | st sort id > output.fasta
+
+  id_seq:
+    description: >
+      De-replicate both by sequence *and* record ID (the part before the first space in the header).
+      The given benchmark actually has unique sequence IDs, so the result is the same as de-replication
+      by sequence.
+    cmd: st unique id,seq input.fasta > output.fasta
+    other:
+      VSEARCH: vsearch --derep_id input.fasta --output output.fasta
+    compare:
+      vsearch_id_sorted:
+        - st unique id,seq input.fasta | st del -d | st sort id > output.fasta
+        - vsearch --derep_id input.fasta --output - --fasta_width 0 | st sort id > output.fasta
+
+
+filter:
+  seqlen:
+    description: Filter sequences by length
+    cmd: st filter 'seqlen >= 100' input.fastq > output.fastq
+    other:
+      Seqtk: seqtk seq -L 100 input.fastq > output.fastq
+      SeqKit: seqkit seq -m 100 input.fastq > output.fastq
+    compare_with: [Seqtk, SeqKit]
+  quality:
+    description: >
+      Filter sequences by the total expected error as calculated from the
+      quality scores
+    cmd: st filter 'exp_err <= 1' input.fastq --to-fa > output.fastq
+    other:
+      VSEARCH: vsearch --fastq_filter input.fastq --fastq_maxee 1 --fastaout output.fasta [--fasta_width 0]
+      USEARCH: usearch -fastq_filter input.fastq -fastq_maxee 1 -fastaout output.fasta
+    compare_with: [VSEARCH]
+  select_ids:
+    description: Select records from a large set of sequences given a list of 1000 sequence IDs
+    prepare: st sample -n 1000 input.fasta --to-tsv id > ids_list.txt
+    cleanup: rm ids_list.txt
+    cmd: st filter -m ids_list.txt 'has_meta()' input.fasta > output.fasta
+    other:
+      VSEARCH: vsearch --fastx_getseqs input.fasta --labels ids_list.txt --fastaout output.fasta
+      SeqKit: seqkit grep -f ids_list.txt input.fasta > output.fasta
+
+
+sample:
+  n:
+    description: Random subsampling to 1000 of sequences
+    cmd: st sample -n 1000 input.fasta > output.fasta
+    other:
+      VSEARCH: vsearch --fastx_subsample input.fasta --sample_size 1000 --fastaout output.fasta
+      Seqtk: seqtk sample input.fasta 1000 > output.fasta
+      SeqKit: seqkit sample -n 1000 input.fasta > output.fasta
+  proportion:
+    description: Random subsampling to ~10% of sequences
+    cmd: st sample -p 0.1 input.fasta > output.fasta
+    other:
+      Seqtk: seqtk sample input.fastq 0.1 > output.fasta
+      SeqKit: seqkit sample -p 0.1 input.fastq > output.fasta
+
+
+find:
+  primer_location:
+    description: Find the forward primer location in the input reads with up to 4 mismatches
+    cmd: st find -D4 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq
+    other:
+      st (4 threads): st find -t4 -D4 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq
+      st (max. mismatches = 2): st find -D2 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq
+      st (max. mismatches = 8): st find -D8 file:primers.fasta input.fastq -a primer={pattern_name} -a rng={match_range} > output.fastq
+  find_trim_primer:
+    description: >
+      Find and trim the forward primer up to an error rate (edit distance) of 20%,
+      discarding unmatched reads.
+      *Note:* Unlike Cutadapt, seqtool currently does not offer ungapped alignments
+      (`--no-indels`).
+    cmd: >
+      st find -f file:primers.fasta -R 0.2 input.fastq -a primer={pattern_name} -a end={match_end} |
+        st trim -e '{attr(end)}:' --fq > output.fastq
+    other:
+      Cutadapt: >
+        cutadapt -g 'file:primers.fasta;min_overlap=15' input.fastq -e 0.2 --rename '{id} primer={adapter_name}' --discard-untrimmed > output.fastq
+  find_trim_primer_j4:
+    description: >
+      Find and trim the forward primer in parallel using 4 threads (cores).
+    cmd: >
+      st find -f file:primers.fasta -R 0.2 -t4 input.fastq -a primer={pattern_name} -a end={match_end} |
+        st trim -e '{attr(end)}:' --fq > output.fastq
+    other:
+      Cutadapt: >
+        cutadapt -j4 -g 'file:primers.fasta;min_overlap=15' input.fastq -e 0.2 --rename '{id} primer={adapter_name}' --discard-untrimmed > output.fastq
+
+
+replace:
+  dna_to_rna:
+    description: Convert DNA to RNA using the replace command
+    cmd: st replace T U input.fasta > output.fasta
+    other:
+      st find: st find T --rep U input.fasta > output.fasta
+      SeqKit: seqkit seq --dna2rna [-w 0] input.fasta > output.fasta
+      FASTX-Toolkit: fasta_nucleotide_changer -r -i input.fasta > output.fasta
+    compare_with:
+      ['st find', SeqKit]
+  dna_to_rna_4:
+    description: Convert DNA to RNA using 4 threads
+    cmd: st replace -t4 T U input.fasta > output.fasta
+    other:
+      st find: st find -t4 T --rep U input.fasta > output.fasta
+
+
+trim:
+  trim:
+    description: Trim the leading 99 bp from the sequences
+    cmd: "st trim 100: input.fasta > output.fasta"
+    other:
+      SeqKit (creates FASTA index): seqkit subseq -r '100:-1' [-w 0] input.fasta > output.fasta
+    compare_with: [SeqKit (creates FASTA index)]
+
+
+upper:
+  upper:
+    description: Convert sequences to uppercase
+    cmd: st upper input.fasta > output.fasta
+    other:
+      Seqtk: seqtk seq -U input.fasta > output.fasta
+      SeqKit: seqkit seq -u [-w 0] input.fasta > output.fasta
+    compare_with: [Seqtk, SeqKit]
+
+
+revcomp:
+  revcomp:
+    description: Reverse complement sequences
+    cmd: st revcomp input.fasta > output.fasta
+    other:
+      Seqtk: seqtk seq -r input.fasta > output.fasta
+      VSEARCH: vsearch --fastx_revcomp input.fasta --fastaout output.fasta [--fasta_width 0]
+      SeqKit: seqkit seq -rp [-w 0] input.fasta > output.fasta
+    compare_with: [Seqtk, SeqKit, VSEARCH]
+
+
+concat:
+  concat:
+    description: Concatenate sequences, adding an `NNNNN` spacer inbetween
+    prepare:
+      - ln -s input.fastq file1.fastq
+      - ln -s input.fastq file2.fastq
+    cleanup: rm file1.fastq file2.fastq
+    cmd: st concat -s 5 -c N file1.fastq file2.fastq > output.fastq
+    other:
+      VSEARCH: vsearch --fastq_join file1.fastq --reverse file2.fastq --join_padgap NNNNN --fastqout output.fastq
diff --git a/profile/fastq_urls.txt b/profile/fastq_urls.txt
new file mode 100644
index 0000000..8e7ef37
--- /dev/null
+++ b/profile/fastq_urls.txt
@@ -0,0 +1,15 @@
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/079/ERR12573579/ERR12573579_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/096/ERR12551596/ERR12551596_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/003/ERR12551603/ERR12551603_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/005/ERR12551605/ERR12551605_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/009/ERR12551609/ERR12551609_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/091/ERR12551691/ERR12551691_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/094/ERR12551694/ERR12551694_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/095/ERR12551695/ERR12551695_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/097/ERR12551697/ERR12551697_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/099/ERR12551699/ERR12551699_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/002/ERR12551702/ERR12551702_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/004/ERR12551704/ERR12551704_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/007/ERR12551707/ERR12551707_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/011/ERR12551711/ERR12551711_1.fastq.gz
+ftp.sra.ebi.ac.uk/vol1/fastq/ERR125/063/ERR12551763/ERR12551763_1.fastq.gz
diff --git a/scripts/compare_tools.py b/scripts/compare_tools.py
new file mode 100644
index 0000000..9c9b62c
--- /dev/null
+++ b/scripts/compare_tools.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+
+import os
+
+
+def run_benches(binary, input_dir, config, temp_dir=None, kind=None):
+    assert os.path.basename(binary) == 'st', "The binary name must be 'st'"
+    d = os.path.abspath(os.path.dirname(binary))
+    os.environ['PATH'] = d + ":" + os.environ['PATH']
+    # sys.path.insert(0, d)
+    if temp_dir is not None:
+        temp_dir = os.path.relpath(temp_dir, input_dir)
+    os.chdir(input_dir)
+    out = {}
+    for cmd, benches in config.items():
+        cmd_out = out[cmd] = {}
+        for bench, cfg in benches.items():
+            print("bench", bench)
+            cmd_out[bench] = run_bench(bench, cfg, temp_dir=temp_dir, kind=kind)
+    return out
+
+def run_bench(name, cfg, temp_dir=None, kind=None):
+    # print(cfg.get('description', name))
+    # from pprint import pprint; pprint(cfg)
+    st_cmd = cfg['cmd']
+    # do time measurements
+    out = {'description': cfg.get('description', name), 'other': {}}
+    if 'prepare' in cfg:
+        if isinstance(cfg['prepare'], str):
+            cfg['prepare'] = [cfg['prepare']]
+        for cmd in cfg['prepare']:
+            call(cmd, shell=True)
+    if kind is None or 'main' in kind:
+        out['st'], outfile = run_command(st_cmd, temp_dir=temp_dir)
+        if outfile is not None:
+            os.remove(outfile)
+    if kind is None or 'other' in kind:
+        for what, cmd in cfg.get('other', {}).items():
+            # print(what, cmd, len(cmd))
+            out['other'][what], outfile = run_command(cmd, temp_dir=temp_dir)
+            if outfile is not None:
+                os.remove(outfile)
+    # run comparisons
+    if kind is None or 'comparisons' in kind:
+        out1 = 'out1'
+        out2 = 'out2'
+        if 'compare_with' in cfg:
+            _, outfile = run_command(st_cmd, temp_dir=temp_dir)
+            os.rename(outfile, out1)
+            for what in cfg['compare_with']:
+                # print("compare", what)
+                cmd = cfg.get('other', {})[what]
+                _, outfile = run_command(cmd)
+                os.rename(outfile, out2)
+                assert_identical(out1, out2, st_cmd, cmd)
+                os.remove(out2)
+            os.remove(out1)
+        for what, cmds in cfg.get('compare', {}).items():
+            # print("compare other", what)
+            assert len(cmds) == 2
+            _, outfile = run_command(cmds[0], temp_dir=temp_dir)
+            os.rename(outfile, out1)
+            _, outfile = run_command(cmds[1], temp_dir=temp_dir)
+            os.rename(outfile, out2)
+            assert_identical(out1, out2, *cmds)
+            os.remove(out1)
+            os.remove(out2)
+    if 'cleanup' in cfg:
+        if isinstance(cfg['cleanup'], str):
+            cfg['cleanup'] = [cfg['cleanup']]
+            for cmd in cfg['cleanup']:
+                call(cmd, shell=True)
+    return out
+
+def assert_identical(f1, f2, cmd1, cmd2):
+    import subprocess
+    try:
+        subprocess.check_call(["diff", "-q", f1, f2])
+    except subprocess.CalledProcessError:
+        import sys
+        print(
+            "COMPARISON ERROR: command output differs for:\n- {}\n- {}\nCheck with diff {} {}".format(
+                cmd1, cmd2, os.path.abspath(f1),
+                os.path.abspath(f2), os.path.abspath(f2)),
+                file=sys.stderr
+            )
+
+
+def run_command(command, temp_dir=None):
+    import re
+    from tempfile import mkstemp
+    # look for an [[alternative command]] to use instead (appended to command that is displayed)
+    mod_cmd = command
+    m = re.search(r"^(.+?)\[\[(.+?)\]\] *$", mod_cmd)
+    if m is not None:
+        command, mod_cmd = m.groups()
+    # look for individual args that should be used, but not be displayed by default
+    command = re.sub(r' +\[.+?\]', ' ', command)
+    mod_cmd = re.sub(r' +\[(.+?)\]', r' \1', mod_cmd)
+    print(command)
+    if command != mod_cmd:
+        print("[[ {} ]]".format(mod_cmd))
+    _, time_out = mkstemp(prefix="st_time", dir=temp_dir)
+    # for running the command, we use Bash here
+    _cmd = ['/usr/bin/time', '-o', time_out, '-f', r'%e\t%M\t%P', 'bash', '-c', mod_cmd]
+    stdout, stderr = call(_cmd)
+    with open(time_out) as f:
+        # we just use the last line, since the first line will have an error message
+        # if the command fails
+        info = list(f.read().splitlines())[-1].split('\t')
+    os.remove(time_out)
+    result = {
+        'cmd': command,
+        'modified_cmd': None if mod_cmd == command else mod_cmd,
+        'stdout': stdout.decode('utf-8'),
+        'stderr': stderr.decode('utf-8'),
+        'elapsed': float(info[0]),
+        # https://stackoverflow.com/questions/61392725/kilobytes-or-kibibytes-in-gnu-time
+        'max_mib': float(info[1])/1024.,
+        'cpu': float(re.sub('%$', '', info[2])),
+    }
+    # Get output file name
+    m = re.search(r'\b(output(\.\w+)+)( |$)', mod_cmd)
+    outfile = None if m is None else m.group(1)
+    return result, outfile
+
+
+def call(cmd, **kwarg):
+    import subprocess
+    # print("call", cmd)
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwarg)
+    out, err = p.communicate()
+    if p.returncode != 0:
+        import sys
+        print("ERROR: non-zero exit code {} for command:\n{}\nstderr:\n{}".format(p.returncode, cmd, err.decode('utf-8')), file=sys.stderr)
+    return out, err
+
+
+def generate_files(fastq_path, outdir):
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+    fq = os.path.join(outdir, 'input.fastq')
+    if os.path.exists(fq):
+        os.remove(fq)
+    os.symlink(os.path.abspath(fastq_path), os.path.abspath(fq))
+    fa = os.path.splitext(fq)[0] + '.fasta'
+    if not os.path.exists(fa):
+        call(['st', '.', '--to-fa', fq, '-o', fa])
+
+
+if __name__ == '__main__':
+    import argparse
+    import yaml
+    import json
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('fastq')
+    parser.add_argument('configfile')
+    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default='-')
+    parser.add_argument('-s', '--selection',
+                        help='comma-separated list of benchmarks to run')
+    parser.add_argument('-b', '--binary', default='target/release/st')
+    parser.add_argument('-d', '--input-dir', default='target/st_benchmark')
+    parser.add_argument('-t', '--temp-dir')
+    parser.add_argument('-k', '--kind', default='main,other,comparisons',
+                        help="Comma-separated list of benchmark types to run. "
+                        "Possible are 'main' (the seqtool commands), 'other' (other tools) "
+                        "and 'comparisons' (comparison of output from different tools). "
+                        "Default is to run all.")
+    args = parser.parse_args()
+
+    generate_files(args.fastq, args.input_dir)
+
+    with open(args.configfile) as f:
+        config = yaml.safe_load(f)
+
+    if args.selection:
+        sel = [s.strip() for s in args.selection.split(',')]
+        config = {k: v for k, v in config.items() if k in sel}
+
+    # print(config)
+    kind = [k.strip() for k in args.kind.split(',')]
+    out = run_benches(args.binary, args.input_dir, config, temp_dir=args.temp_dir, kind=kind)
+
+    json.dump(out, args.output, indent=2)
diff --git a/scripts/summarize_comparison.py b/scripts/summarize_comparison.py
new file mode 100644
index 0000000..72976ad
--- /dev/null
+++ b/scripts/summarize_comparison.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import json
+import re
+
+
+def gen_summary(json_input, md_out):
+    from html import escape
+
+    def fmt_bench(d):
+        return '{}{}<br/>{}'.format(
+            format_time(d),
+            ' {:.0f}% CPU'.format(d['cpu']) if abs(100 - d['cpu']) > 5 else '',
+            format_mem(d),
+        )
+
+    def find_best(d):
+        runs = [d['st']] + list(d.get('other', {}).values())
+        if len(runs) > 1:
+            times = sorted((r['elapsed'], i) for i, r in enumerate(runs))
+            if times[0][0] > 0:
+                runs[times[0][1]]['fastest'] = times[1][0] / times[0][0]
+            mem = sorted((r['max_mib'], i) for i, r in enumerate(runs))
+            if mem[0][0] > 0:
+                runs[mem[0][1]]['lowest_mem'] = mem[1][0] / mem[0][0]
+
+    def format_time(d):
+        f = d.get('fastest')
+        if f is not None:
+            return '🕓 <b>{:.1f} s</b> 🏆 ({:.1f}x)'.format(d['elapsed'], f)
+        return '🕓 {:.1f} s'.format(d['elapsed'])
+
+    def format_mem(d):
+        f = d.get('lowest_mem')
+        if f is not None:
+            return '📈 <b>{:.1f} MiB</b> 🏆 ({:.2f}x)'.format(d['max_mib'], f)
+        return '📈 {:.1f} MiB'.format(d['max_mib'])
+
+    def fmt_output(d):
+        strip_newlines = lambda msg: re.sub(r'(?ms:[\r\n\s]+$)', '', msg)
+        out = ''
+        if d['stdout']:
+            # TODO: get this to work: https://github.com/squidfunk/mkdocs-material/issues/4964
+            out += '<details><summary>🟦 output</summary>\n\n```\n{}\n```\n\n</details>\n'.format(strip_newlines(d['stdout']))
+        if d['stderr']:
+            out += '<details><summary> messages</summary>\n\n```\n{}\n```\n\n</details>\n'.format(strip_newlines(d['stderr']))
+        return out
+
+    for command, comparisons in json.load(json_input).items():
+        md_out.write('## {}\n'.format(command))
+        md_out.write('<table markdown class="cmd">\n\n')
+        for comparison, d in comparisons.items():
+            find_best(d)
+            st = d['st']
+            md_out.write('<tr markdown>\n<td markdown>\n\n{}\n\n</td>\n\n'.format(escape(d.get('description', comparison))))
+            md_out.write('<td markdown>\n\n```bash\n{}\n```\n\n{}\n'.format(st['cmd'], fmt_output(st)))
+            if 'other' in d and d['other']:
+                md_out.write('<details markdown><summary>{}</summary>\n\n<table markdown class="cmd">\n'.format(
+                    "  ❙  ".join('<b>{}</b> {}'.format(
+                        escape(tool), format_time(o),
+                    )
+                    for tool, o in d['other'].items())
+                )
+                )
+                for tool, o in d['other'].items():
+                    code = '<td markdown>\n\n```bash\n{}\n```\n\n{}</td>'.format(
+                        o['cmd'].replace('\n', ' '),
+                        fmt_output(o)
+                    )
+                    md_out.write('\n<tr markdown>\n<td markdown>{}</td>\n\n{}\n\n<td markdown>{}</td>\n\n</tr>\n\n'.format(
+                        escape(tool),
+                        code,
+                        fmt_bench(o)
+                    ))
+                md_out.write('</table>\n\n</details>\n\n')
+            md_out.write('</td>\n\n<td>{}</td>\n\n</tr>\n\n'.format(fmt_bench(st)))
+        md_out.write('</table>\n\n')
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('json_input', type=argparse.FileType('r'))
+    parser.add_argument('md_out', type=argparse.FileType('w'))
+    args = parser.parse_args()
+
+    gen_summary(**vars(args))
diff --git a/src/cli.rs b/src/cli.rs
index 83315e7..aa2e5de 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -332,10 +332,9 @@ pub enum SubCommand {
     /// Sort records by sequence or any other criterion
     #[cfg(any(feature = "all-commands", feature = "sort"))]
     Sort(cmd::sort::cli::SortCommand),
-    /// De-replicate records by sequence or any other criterion, returning only
+    /// De-replicate by sequence and/or other properties, returning only
     /// unique records
     #[cfg(any(feature = "all-commands", feature = "unique"))]
-    #[clap(about, long_about)]
     Unique(cmd::unique::cli::UniqueCommand),
     /// Keep/exclude sequences based on different properties with a
     /// mathematical (JavaScript) expression
diff --git a/src/cmd/count.rs b/src/cmd/count.rs
index d2be0b4..5a6ef9a 100755
--- a/src/cmd/count.rs
+++ b/src/cmd/count.rs
@@ -11,12 +11,9 @@ use crate::helpers::{value::SimpleValue, DefaultHashMap as HashMap};
 use crate::var::varstring::VarString;
 
 pub const DESC: &str = "
-Records in all the provided input (including multiple files) are counted
-collectively.
-
-In addition, records can be summarized over one or more categories
-specified with `-k/--key`. The reported counts are always sorted by the
-numeric or text category.
+The overall record count is returned for all input files collectively.
+Optionally, grouping categories (text or numeric) can be specified using
+`-k/--key`. The tab-delimited output is sorted by the categories.
 ";
 
 lazy_static::lazy_static! {
@@ -49,9 +46,9 @@ pub struct CountCommand {
 
     /// Maximum number of categories to count before aborting with an error.
     /// This limit is a safety measure to prevent memory exhaustion.
-    /// Usually, a very large number of categories is not intended and may
-    /// happen if continuous numbers are not categorized with the
-    /// `bin(<num>, <interval>)` function.
+    /// A very large number of categories could unintentionally occur with a
+    /// condinuous numeric key (e.g. `gc_percent`). These can be grouped into
+    /// regular intervals using `bin(<variable>, <interval>)`.
     #[arg(short = 'l', long, default_value_t = 1000000)]
     category_limit: usize,
 
diff --git a/src/cmd/find/vars.rs b/src/cmd/find/vars.rs
index 8aea1a2..2f40e15 100755
--- a/src/cmd/find/vars.rs
+++ b/src/cmd/find/vars.rs
@@ -22,7 +22,7 @@ variable_enum! {
     ///
     /// # Examples
     ///
-    /// Find a primer sequence with up to 2 mismatches (`-d/--dist``) and write
+    /// Find a primer sequence with up to 2 mismatches (`-d/--dist`) and write
     /// the match range and the mismatches ('dist') to the header as attributes.
     /// The result will be 'undefined' (=undefined in JavaScript) if there are > 2 mismatches
     ///
@@ -32,7 +32,7 @@ variable_enum! {
     /// SEQUENCE
     /// >id2 rng=1:20 dist=0
     /// SEQUENCE
-    /// >id3 rng= dist=
+    /// >id3 rng=undefined dist=undefined
     /// SEQUENCE
     /// (...)
     ///
@@ -41,7 +41,7 @@ variable_enum! {
     /// while non-matching sequences are written to 'no_primer.fasta'
     ///
     /// `st find -f -d 2 CTTGGTCATTTAGAGGAAGTAA --dropped no_primer.fasta -a end={match_end} reads.fasta |
-    ///    st trim -e '{attr(match_end)}..' > primer_trimmed.fasta`
+    ///    st trim -e '{attr(match_end)}:' > primer_trimmed.fasta`
     ///
     ///
     /// Search for several primers with up to 2 mismatches and write the name and mismatches
@@ -53,7 +53,7 @@ variable_enum! {
     /// SEQUENCE
     /// >id1 primer=primer_2 dist=0
     /// SEQUENCE
-    /// >id1 primer= dist=
+    /// >id1 primer=undefined dist=undefined
     /// SEQUENCE
     /// (...)
     FindVar {
@@ -145,7 +145,7 @@ variable_enum! {
         /// to the pattern
         MatchSubst(Number) { hit: String = String::from("1"), pattern: usize = 1 },
         /// Name of the matching pattern (patterns supplied with `file:patterns.fasta`).
-        /// In case a single pattern was specified in the commandline, this will just be `<pattern>`.
+        /// In case a single pattern was specified in the commandline, this will just be *<pattern>*.
         /// `pattern_name(rank)` selects the n-th matching pattern, sorted by edit distance
         /// and/or pattern number (depending on `-D/-R` and `--in-order`).
         PatternName(Text) { rank: usize = 1 },
diff --git a/src/cmd/mask.rs b/src/cmd/mask.rs
index cca7cf9..8057064 100755
--- a/src/cmd/mask.rs
+++ b/src/cmd/mask.rs
@@ -1,13 +1,19 @@
 use clap::Parser;
 
-use crate::cli::CommonArgs;
+use crate::cli::{CommonArgs, WORDY_HELP};
 use crate::config::Config;
 use crate::error::CliResult;
 use crate::helpers::var_range::VarRanges;
 use crate::io::SeqQualRecord;
 
+pub const DESC: &str = "\
+Masks the sequence within a given range or comma delimited list of ranges
+by converting to lowercase (soft mask) or replacing with a character (hard
+masking). Reverting soft masking is also possible.";
+
 #[derive(Parser, Clone, Debug)]
 #[clap(next_help_heading = "'Mask' command options")]
+#[clap(before_help=DESC, help_template=WORDY_HELP)]
 pub struct MaskCommand {
     /// Range in the form 'start:end' or 'start:' or ':end',
     /// The range start/end may be defined by varialbes/functions,
diff --git a/src/cmd/trim.rs b/src/cmd/trim.rs
index ee47799..d0f6ee8 100755
--- a/src/cmd/trim.rs
+++ b/src/cmd/trim.rs
@@ -1,19 +1,13 @@
 use clap::Parser;
 
-use crate::cli::{CommonArgs, WORDY_HELP};
+use crate::cli::CommonArgs;
 use crate::config::Config;
 use crate::error::CliResult;
 use crate::helpers::{rng::Range, var_range::VarRanges};
 use crate::io::{Record, SeqQualRecord};
 
-pub const DESC: &str = "\
-Masks the sequence within a given range or comma delimited list of ranges
-by converting to lowercase (soft mask) or replacing with a character (hard
-masking). Reverting soft masking is also possible.";
-
 #[derive(Parser, Clone, Debug)]
 #[clap(next_help_heading = "'Trim' command options")]
-#[clap(before_help=DESC, help_template=WORDY_HELP)]
 pub struct TrimCommand {
     /// Range(s) in the form 'start:end' or 'start:' or ':end',
     /// Multiple ranges can be supplied as comma-delimited list:
diff --git a/var_provider/src/var_provider.rs b/var_provider/src/var_provider.rs
index e040344..e0a6a78 100644
--- a/var_provider/src/var_provider.rs
+++ b/var_provider/src/var_provider.rs
@@ -139,14 +139,14 @@ pub trait VarProviderInfo: fmt::Debug {
                         // .into_iter()
                         // .map(|u| format!("{}`", u))
                         .join("<br />");
-                    // TODO: very simple but fine for a help page
+                    // TODO: very simple but... but probably fine for a help page
                     let desc = info
                         .description
                         .replace('<', r"\<")
                         .replace('>', r"\>")
                         .replace('\n', "<br />");
 
-                    writeln!(out, "| {} | {} |", usages, desc)?;
+                    writeln!(out, "| <a name=\"{}\"></a>{} | {} |", info.name, usages, desc)?;
                 }
             }
             writeln!(out)?;
@@ -161,7 +161,7 @@ pub trait VarProviderInfo: fmt::Debug {
             writeln!(out, "### {}", ex)?;
             for example in examples {
                 writeln!(out, "{}:", example.description)?;
-                writeln!(out, "```sh\n{}\n```", example.command)?;
+                writeln!(out, "```bash\n{}\n```", example.command)?;
                 if let Some(output) = example.output {
                     writeln!(out, "```\n{}\n```", output)?;
                 }
diff --git a/wix/main.wxs b/wix/main.wxs
index f2c4209..8aa5ab0 100644
--- a/wix/main.wxs
+++ b/wix/main.wxs
@@ -69,7 +69,7 @@
 
         <Package Id='*'
             Keywords='Installer'
-            Description='General purpose tool for reading, modifying and writing biological sequences.'
+            Description='General-purpose tool for reading, modifying and writing biological sequences.'
             Manufacturer='Markus Schlegel'
             InstallerVersion='450'
             Languages='1033'