From fb547bfc8d4db450997e7e32123be05e4f4bfba6 Mon Sep 17 00:00:00 2001
From: Hayden Metsky
Date: Mon, 15 Jan 2024 16:33:33 -0500
Subject: [PATCH] Clarify certain help messages

---
 bin/design.py | 65 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 19 deletions(-)

diff --git a/bin/design.py b/bin/design.py
index 332b9cbf..86714818 100755
--- a/bin/design.py
+++ b/bin/design.py
@@ -757,15 +757,19 @@ def check_cluster_and_design_separately(val):
         help=("(Optional) If set, cluster all input sequences using their "
               "MinHash signatures, design probes separately on each cluster, "
               "and combine the resulting probes. This can significantly lower "
-              "runtime and memory usage, but may lead to a suboptimal "
+              "runtime and memory usage, but may lead to a worse "
               "solution. The value CLUSTER_AND_DESIGN_SEPARATELY gives the "
               "distance threshold for determining clusters in terms of "
               "average nucleotide dissimilarity (1-ANI, where ANI is "
               "average nucleotide identity; see --cluster-and-design-"
-              "separately-method for details); higher values "
-              "result in fewer clusters, and thus longer runtime. Values "
+              "separately-method for details on clustering methods); higher "
+              "values result in fewer clusters, and thus longer runtime. Values "
               "must be in (0,0.5], and generally should be around 0.1 to "
-              "0.2. When used, this creates a separate genome for each "
+              "0.2; in general, we recommend 0.15 because, with probe-target "
+              "divergences typically desired in practice, it is reasonable "
+              "to design probes independently on clusters of sequences "
+              "determined at this threshold. When used, this option creates "
+              "a separate genome for each "
               "input sequence -- it collapses all sequences, across both "
               "groups and genomes, into one list of sequences in one group. "
               "Therefore, genomes will not be grouped as specified in the "
@@ -780,7 +784,7 @@ def check_cluster_and_design_separately(val):
               "if their estimated nucleotide dissimilarity is within "
               "the value CLUSTER_AND_DESIGN_SEPARATELY. If 'hierarchical', "
               "clusters are determined by agglomerative hierarchical "
-              "clustering and the the value CLUSTER_AND_DESIGN_SEPARATELY "
+              "clustering and the value CLUSTER_AND_DESIGN_SEPARATELY "
               "is the inter-cluster distance threshold to merge clusters. "
               "If 'choose', use a heuristic to decide among 'simple' and "
               "'hierarchical' based on the input. This option can affect "
@@ -795,9 +799,20 @@ def check_cluster_and_design_separately(val):
               "length CLUSTER_FROM_FRAGMENTS nt, and cluster these fragments. "
               "This can be useful for improving runtime on input with "
               "especially large genomes, in which probes for different "
-              "fragments can be designed separately. Values should generally "
-              "be around 50,000. For this to be used, "
-              "--cluster-and-design-separately must also be set."))
+              "fragments can be designed independently. The fragment length "
+              "must balance a trade-off between (a) yielding too many "
+              "fragments (owing to a short fragment length), which would slow "
+              "clustering and potentially lead to outputs that are worse "
+              "(e.g., in terms of number of probes); and (b) yielding too few "
+              "fragments (owing to a long fragment length), which negates the "
+              "benefit of this argument in speeding design on large genomes. "
+              "In practice, lengths of around 50,000 nt achieve a reasonable "
+              "balance, i.e., setting the value to 50000 is a reasonable "
+              "recommendation. For this option to be used, "
+              "--cluster-and-design-separately must also be set because "
+              "this argument tells CATCH to proceed with clustering as "
+              "described for that argument, except using fragments rather "
+              "than whole input sequences."))
 
     # Filter candidate probes with LSH
     parser.add_argument('--filter-with-lsh-hamming',
@@ -807,11 +822,15 @@ def check_cluster_and_design_separately(val):
         help=("(Optional) If set, filter candidate probes for near-"
               "duplicates using LSH with a family of hash functions that "
               "works with Hamming distance. FILTER_WITH_LSH_HAMMING gives "
               "the maximum Hamming distance at which to call near-"
               "duplicates; it should be commensurate with (but not greater "
-              "than) MISMATCHES. Using this may significantly improve "
+              "than) MISMATCHES. Values equal to MISMATCHES minus 1 or 2 "
+              "are reasonable for near-duplicate detection; for example, "
+              "if MISMATCHES is 5, a reasonable value is 3 or 4. "
+              "Using this may significantly improve "
               "runtime and reduce memory usage by reducing the number of "
               "candidate probes to consider, but may lead to a slightly "
-              "sub-optimal solution. It may also, particularly with "
-              "relatively high values of FILTER_WITH_LSH_HAMMING, cause "
+              "worse solution. It may also, particularly with "
+              "values of FILTER_WITH_LSH_HAMMING that are similar to or "
+              "equal to MISMATCHES, cause "
               "coverage obtained for each genome to be slightly less than "
               "the desired coverage (COVERAGE) when that desired coverage "
               "is the complete genome; using --print-analysis or "
@@ -832,24 +851,32 @@ def check_filter_with_lsh_minhash(val):
               "duplicates using LSH with a MinHash family. "
               "FILTER_WITH_LSH_MINHASH gives the maximum Jaccard distance "
               "(1 minus Jaccard similarity) at which to call near-duplicates; "
-              "the Jaccard similarity is calculated by treating each probe "
-              "as a set of overlapping 10-mers. Its value should be "
+              "the Jaccard similarity between two probes is calculated by "
+              "treating each probe as a set of overlapping 10-mers and "
+              "comparing the two sets. Its value should be "
               "commensurate with parameter values determining whether a probe "
-              "hybridizes to a target sequence, but this can be difficult "
-              "to measure compared to the input for --filter-with-lsh-hamming. "
-              "This argument allows more sensitivity in near-duplicate "
+              "hybridizes to a target sequence. With the default hybridization "
+              "model using -m MISMATCHES, let the probe-target divergence D be "
+              "MISMATCHES divided by PROBE_LENGTH; FILTER_WITH_LSH_MINHASH "
+              "should be, at most, roughly [1 - 1/(2*e^(10*D) - 1)] (see Ondov "
+              "et al. 2016 and solve Eq. 4 for 1-j with k=10). "
+              "This value can be difficult to determine compared to the value "
+              "for --filter-with-lsh-hamming, but "
+              "this argument allows more sensitivity in near-duplicate "
               "detection than --filter-with-lsh-hamming (e.g., if near-"
               "duplicates should involve probes shifted relative to each "
               "other) and, therefore, greater improvement in runtime and "
-              "memory usage. Values should generally be around 0.5 to 0.7. "
+              "memory usage. Values should generally be around 0.5 to 0.7, "
+              "which correspond to reasonable and typically used "
+              "probe-target divergences. "
               "The same caveat mentioned in the help message for "
               "--filter-with-lsh-hamming also applies here; namely, it can "
               "cause the coverage obtained for each genome to be slightly "
               "less than the desired coverage (COVERAGE), and especially so "
               "with low values of MISMATCHES (~0, 1, or 2). Values of "
               "FILTER_WITH_LSH_MINHASH above ~0.7 may start to require "
-              "significant memory and runtime for near-duplicate detection "
-              "and are usually not recommended."))
+              "significant memory and runtime for near-duplicate detection, "
+              "and should not be needed in practice."))
 
     # Miscellaneous technical adjustments
     parser.add_argument('--small-seq-skip',
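The bound quoted in the revised --filter-with-lsh-minhash help can be sanity-checked numerically. The sketch below is only an illustration of that arithmetic, not part of CATCH or of this patch: the helper max_jaccard_distance is hypothetical, and it assumes the default probe length of 100 nt (-pl 100). It evaluates 1 - 1/(2*e^(10*D) - 1) for a few values of -m MISMATCHES and shows that the result lands near the 0.5 to 0.7 range recommended in the help text.

import math

def max_jaccard_distance(mismatches, probe_length, k=10):
    # Hypothetical helper (not part of CATCH): evaluates the upper bound on
    # FILTER_WITH_LSH_MINHASH quoted in the help text, 1 - 1/(2*e^(k*D) - 1),
    # where D = MISMATCHES / PROBE_LENGTH is the probe-target divergence and
    # k is the k-mer size (10, per the help text).
    d = mismatches / probe_length
    return 1.0 - 1.0 / (2.0 * math.exp(k * d) - 1.0)

for m in (4, 5, 6, 7):
    # Assuming a probe length of 100 nt
    print(m, round(max_jaccard_distance(m, 100), 3))
# Prints roughly 0.496, 0.565, 0.622, and 0.67 -- consistent with the
# 0.5 to 0.7 range the help text suggests for typical probe-target divergences.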