From fb547bfc8d4db450997e7e32123be05e4f4bfba6 Mon Sep 17 00:00:00 2001
From: Hayden Metsky
Date: Mon, 15 Jan 2024 16:33:33 -0500
Subject: [PATCH] Clarify certain help messages

---
 bin/design.py | 65 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 19 deletions(-)

diff --git a/bin/design.py b/bin/design.py
index 332b9cbf..86714818 100755
--- a/bin/design.py
+++ b/bin/design.py
@@ -757,15 +757,19 @@ def check_cluster_and_design_separately(val):
         help=("(Optional) If set, cluster all input sequences using their "
               "MinHash signatures, design probes separately on each cluster, "
               "and combine the resulting probes. This can significantly lower "
-              "runtime and memory usage, but may lead to a suboptimal "
+              "runtime and memory usage, but may lead to a worse "
               "solution. The value CLUSTER_AND_DESIGN_SEPARATELY gives the "
               "distance threshold for determining clusters in terms of "
               "average nucleotide dissimilarity (1-ANI, where ANI is "
               "average nucleotide identity; see --cluster-and-design-"
-              "separately-method for details); higher values "
-              "result in fewer clusters, and thus longer runtime. Values "
+              "separately-method for details on clustering methods); higher "
+              "values result in fewer clusters, and thus longer runtime. Values "
               "must be in (0,0.5], and generally should be around 0.1 to "
-              "0.2. When used, this creates a separate genome for each "
+              "0.2; in general, we recommend 0.15 because, with probe-target "
+              "divergences typically desired in practice, it is reasonable "
+              "to design probes independently on clusters of sequences "
+              "determined at this threshold. When used, this option creates "
+              "a separate genome for each "
               "input sequence -- it collapses all sequences, across both "
               "groups and genomes, into one list of sequences in one group. "
               "Therefore, genomes will not be grouped as specified in the "
@@ -780,7 +784,7 @@ def check_cluster_and_design_separately(val):
               "if their estimated nucleotide dissimilarity is within "
               "the value CLUSTER_AND_DESIGN_SEPARATELY. If 'hierarchical', "
               "clusters are determined by agglomerative hierarchical "
-              "clustering and the the value CLUSTER_AND_DESIGN_SEPARATELY "
+              "clustering and the value CLUSTER_AND_DESIGN_SEPARATELY "
               "is the inter-cluster distance threshold to merge clusters. "
               "If 'choose', use a heuristic to decide among 'simple' and "
               "'hierarchical' based on the input. This option can affect "
@@ -795,9 +799,20 @@ def check_cluster_and_design_separately(val):
               "length CLUSTER_FROM_FRAGMENTS nt, and cluster these fragments. "
               "This can be useful for improving runtime on input with "
               "especially large genomes, in which probes for different "
-              "fragments can be designed separately. Values should generally "
-              "be around 50,000. For this to be used, "
-              "--cluster-and-design-separately must also be set."))
+              "fragments can be designed independently. The fragment length "
+              "must balance a trade-off between (a) yielding too many "
+              "fragments (owing to a short fragment length), which would slow "
+              "clustering and potentially lead to outputs that are worse "
+              "(e.g., in terms of number of probes); and (b) yielding too few "
+              "fragments (owing to a long fragment length), which negates the "
+              "benefit of this argument in speeding design on large genomes. "
+              "In practice, lengths of around 50,000 nt achieve a reasonable "
+              "balance, i.e., setting the value to 50000 is a reasonable "
+              "recommendation. For this option to be used, "
+              "--cluster-and-design-separately must also be set because "
+              "this argument tells CATCH to proceed with clustering as "
+              "described for that argument, except using fragments rather "
+              "than whole input sequences."))
 
     # Filter candidate probes with LSH
     parser.add_argument('--filter-with-lsh-hamming',
@@ -807,11 +822,15 @@ def check_cluster_and_design_separately(val):
         help=("(Optional) If set, filter candidate probes for near-"
               "duplicates using LSH with a family of hash functions that "
               "works with Hamming distance. FILTER_WITH_LSH_HAMMING gives "
               "the maximum Hamming distance at which to call near-"
               "duplicates; it should be commensurate with (but not greater "
-              "than) MISMATCHES. Using this may significantly improve "
+              "than) MISMATCHES. Values equal to MISMATCHES minus 1 or 2 "
+              "are reasonable for near-duplicate detection; for example, "
+              "if MISMATCHES is 5, a reasonable value is 3 or 4. "
+              "Using this may significantly improve "
               "runtime and reduce memory usage by reducing the number of "
               "candidate probes to consider, but may lead to a slightly "
-              "sub-optimal solution. It may also, particularly with "
-              "relatively high values of FILTER_WITH_LSH_HAMMING, cause "
+              "worse solution. It may also, particularly with "
+              "values of FILTER_WITH_LSH_HAMMING that are similar to or "
+              "equal to MISMATCHES, cause "
               "coverage obtained for each genome to be slightly less than "
               "the desired coverage (COVERAGE) when that desired coverage "
               "is the complete genome; using --print-analysis or "
@@ -832,24 +851,32 @@ def check_filter_with_lsh_minhash(val):
               "duplicates using LSH with a MinHash family. "
               "FILTER_WITH_LSH_MINHASH gives the maximum Jaccard distance "
               "(1 minus Jaccard similarity) at which to call near-duplicates; "
-              "the Jaccard similarity is calculated by treating each probe "
-              "as a set of overlapping 10-mers. Its value should be "
+              "the Jaccard similarity between two probes is calculated by "
+              "treating each probe as a set of overlapping 10-mers and "
+              "comparing the two sets. Its value should be "
               "commensurate with parameter values determining whether a probe "
-              "hybridizes to a target sequence, but this can be difficult "
-              "to measure compared to the input for --filter-with-lsh-hamming. "
-              "This argument allows more sensitivity in near-duplicate "
+              "hybridizes to a target sequence. With the default hybridization "
+              "model using -m MISMATCHES, let the probe-target divergence D be "
+              "MISMATCHES divided by PROBE_LENGTH; FILTER_WITH_LSH_MINHASH "
+              "should be, at most, roughly [1 - 1/(2*e^(10*D) - 1)] (see Ondov "
+              "et al. 2016 and solve Eq. 4 for 1-j with k=10). "
+              "This value can be difficult to determine compared to the value "
+              "for --filter-with-lsh-hamming, but "
+              "this argument allows more sensitivity in near-duplicate "
               "detection than --filter-with-lsh-hamming (e.g., if near-"
               "duplicates should involve probes shifted relative to each "
               "other) and, therefore, greater improvement in runtime and "
-              "memory usage. Values should generally be around 0.5 to 0.7. "
+              "memory usage. Values should generally be around 0.5 to 0.7, "
+              "which correspond to reasonable and typically used "
+              "probe-target divergences. "
               "The same caveat mentioned in the help message for "
               "--filter-with-lsh-hamming also applies here; namely, it can "
               "cause the coverage obtained for each genome to be slightly "
               "less than the desired coverage (COVERAGE), and especially so "
               "with low values of MISMATCHES (~0, 1, or 2). Values of "
               "FILTER_WITH_LSH_MINHASH above ~0.7 may start to require "
-              "significant memory and runtime for near-duplicate detection "
-              "and are usually not recommended."))
+              "significant memory and runtime for near-duplicate detection, "
+              "and should not be needed in practice."))
 
     # Miscellaneous technical adjustments
     parser.add_argument('--small-seq-skip',
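The bound quoted in the revised --filter-with-lsh-minhash help can be sanity-checked numerically. The sketch below is only an illustration of that arithmetic, not part of CATCH or of this patch: the helper max_jaccard_distance is hypothetical, and it assumes the default probe length of 100 nt (-pl 100). It evaluates 1 - 1/(2*e^(10*D) - 1) for a few values of -m MISMATCHES and shows that the result lands near the 0.5 to 0.7 range recommended in the help text.

import math

def max_jaccard_distance(mismatches, probe_length, k=10):
    # Hypothetical helper (not part of CATCH): evaluates the upper bound on
    # FILTER_WITH_LSH_MINHASH quoted in the help text, 1 - 1/(2*e^(k*D) - 1),
    # where D = MISMATCHES / PROBE_LENGTH is the probe-target divergence and
    # k is the k-mer size (10, per the help text).
    d = mismatches / probe_length
    return 1.0 - 1.0 / (2.0 * math.exp(k * d) - 1.0)

for m in (4, 5, 6, 7):
    # Assuming a probe length of 100 nt
    print(m, round(max_jaccard_distance(m, 100), 3))
# Prints roughly 0.496, 0.565, 0.622, and 0.67 -- consistent with the
# 0.5 to 0.7 range the help text suggests for typical probe-target divergences.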