nextstrain · victorlin · Jan 28, 2023 · Jan 23, 2023 · Jan 27, 2023
diff --git a/docs/contribute/DEV_DOCS.md b/docs/contribute/DEV_DOCS.md
@@ -125,6 +125,13 @@ To compare JSON outputs with stochastic numerical values, use `scripts/diff_json
 
 Both tree and JSON comparison scripts rely on [deepdiff](https://deepdiff.readthedocs.io/en/latest/) for underlying comparisons.
 
+##### Tips for writing Cram tests
+
+Cram is a simple testing framework that is also very versatile. Over time, we have changed the way we design and organize Augur's Cram tests. You might find older practices in existing tests that haven't been updated yet, but these are the latest guidelines that we've discovered to be helpful.
+
+1. Keep Cram files modular. This makes it easier to see which command is failing.
+2. Create files in the initial working directory, as it is a temporary working directory unique to the test. Note that the name of the `$TMP` directory is misleading: although it is temporary, it is shared across all tests, so any files written directly to `$TMP` must be explicitly removed at the end of each test to avoid affecting other tests. The initial directory of each test is a unique directory within `$TMP`.
+
 #### Running Tests
 
 You've written tests and now you want to run them to see if they are passing.

diff --git a/tests/functional/filter/cram/_setup.sh b/tests/functional/filter/cram/_setup.sh
@@ -1,2 +1 @@
-pushd "$TESTDIR/../../" > /dev/null
-export AUGUR="${AUGUR:-../../bin/augur}"
+export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
diff --git a/tests/functional/filter/cram/filter-exclude-include.t b/tests/functional/filter/cram/filter-exclude-include.t
@@ -1,17 +1,15 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Filter with exclude query for two regions that comprise all but one strain.
 This filter should leave a single record from Oceania.
 Force include one South American record by country to get two total records.
 
   $ ${AUGUR} filter \
-  >  --metadata filter/data/metadata.tsv \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
   >  --include-where "country=Ecuador" \
-  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
-  $ wc -l "$TMP/filtered_strains.txt"
+  >  --output-strains filtered_strains.txt > /dev/null
+  $ wc -l filtered_strains.txt
   \s*2 .* (re)
-  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-force-include-no-duplicates.t b/tests/functional/filter/cram/filter-force-include-no-duplicates.t
@@ -1,22 +1,21 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 
 Test that a force-included strain is only output once.
 
 
 Create some files for testing.
 
-  $ cat >$TMP/metadata.tsv <<~~
+  $ cat >metadata.tsv <<~~
   > strain	col
   > a	1
   > b	2
   > c	3
   > d	4
   > ~~
-  $ cat >$TMP/sequences.fasta <<~~
+  $ cat >sequences.fasta <<~~
   > >a
   > NNNN
   > >b
@@ -30,26 +29,26 @@ Create some files for testing.
 Test all outputs with --include-where.
 
   $ ${AUGUR} filter \
-  >   --metadata $TMP/metadata.tsv \
-  >   --sequences $TMP/sequences.fasta \
+  >   --metadata metadata.tsv \
+  >   --sequences sequences.fasta \
   >   --subsample-max-sequences 4 \
   >   --include-where col=1 \
   >   --subsample-seed 0 \
-  >   --output-metadata $TMP/metadata-filtered.tsv \
-  >   --output-strains $TMP/strains-filtered.txt \
-  >   --output-sequences $TMP/sequences-filtered.fasta \
+  >   --output-metadata metadata-filtered.tsv \
+  >   --output-strains strains-filtered.txt \
+  >   --output-sequences sequences-filtered.fasta \
   >   > /dev/null 2>&1
-  $ cat $TMP/metadata-filtered.tsv | tail -n+2 | sort -k1
+  $ cat metadata-filtered.tsv | tail -n+2 | sort -k1
   a\t1 (esc)
   b\t2 (esc)
   c\t3 (esc)
   d\t4 (esc)
-  $ cat $TMP/strains-filtered.txt | sort
+  $ cat strains-filtered.txt | sort
   a
   b
   c
   d
-  $ cat $TMP/sequences-filtered.fasta
+  $ cat sequences-filtered.fasta
   >a
   NNNN
   >b
@@ -61,30 +60,30 @@ Test all outputs with --include-where.
 
 Test all outputs with --include.
 
-  $ cat >$TMP/include.txt <<~~
+  $ cat >include.txt <<~~
   > a
   > ~~
   $ ${AUGUR} filter \
-  >   --metadata $TMP/metadata.tsv \
-  >   --sequences $TMP/sequences.fasta \
+  >   --metadata metadata.tsv \
+  >   --sequences sequences.fasta \
   >   --subsample-max-sequences 4 \
-  >   --include $TMP/include.txt \
+  >   --include include.txt \
   >   --subsample-seed 0 \
-  >   --output-metadata $TMP/metadata-filtered.tsv \
-  >   --output-strains $TMP/strains-filtered.txt \
-  >   --output-sequences $TMP/sequences-filtered.fasta \
+  >   --output-metadata metadata-filtered.tsv \
+  >   --output-strains strains-filtered.txt \
+  >   --output-sequences sequences-filtered.fasta \
   >   > /dev/null 2>&1
-  $ cat $TMP/metadata-filtered.tsv | tail -n+2 | sort -k1
+  $ cat metadata-filtered.tsv | tail -n+2 | sort -k1
   a\t1 (esc)
   b\t2 (esc)
   c\t3 (esc)
   d\t4 (esc)
-  $ cat $TMP/strains-filtered.txt | sort
+  $ cat strains-filtered.txt | sort
   a
   b
   c
   d
-  $ cat $TMP/sequences-filtered.fasta
+  $ cat sequences-filtered.fasta
   >a
   NNNN
   >b

diff --git a/tests/functional/filter/cram/filter-metadata-duplicates-error.t b/tests/functional/filter/cram/filter-metadata-duplicates-error.t
@@ -1,11 +1,10 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Error on duplicates in metadata within same chunk.
 
-  $ cat >$TMP/metadata-duplicates.tsv <<~~
+  $ cat >metadata-duplicates.tsv <<~~
   > strain	date
   > a	2010-10-10
   > a	2010-10-10
@@ -14,31 +13,31 @@ Error on duplicates in metadata within same chunk.
   > d	2010-10-10
   > ~~
   $ ${AUGUR} filter \
-  >   --metadata $TMP/metadata-duplicates.tsv \
+  >   --metadata metadata-duplicates.tsv \
   >   --group-by year \
   >   --sequences-per-group 2 \
   >   --subsample-seed 0 \
   >   --metadata-chunk-size 10 \
-  >   --output-metadata $TMP/metadata-filtered.tsv > /dev/null
+  >   --output-metadata metadata-filtered.tsv > /dev/null
   ERROR: The following strains are duplicated in .* (re)
   a
   [2]
-  $ cat $TMP/metadata-filtered.tsv
+  $ cat metadata-filtered.tsv
   cat: .*: No such file or directory (re)
   [1]
 
 Error on duplicates in metadata in separate chunks.
 
   $ ${AUGUR} filter \
-  >   --metadata $TMP/metadata-duplicates.tsv \
+  >   --metadata metadata-duplicates.tsv \
   >   --group-by year \
   >   --sequences-per-group 2 \
   >   --subsample-seed 0 \
   >   --metadata-chunk-size 1 \
-  >   --output-metadata $TMP/metadata-filtered.tsv > /dev/null
+  >   --output-metadata metadata-filtered.tsv > /dev/null
   ERROR: The following strains are duplicated in .* (re)
   a
   [2]
-  $ cat $TMP/metadata-filtered.tsv
+  $ cat metadata-filtered.tsv
   cat: .*: No such file or directory (re)
   [1]
diff --git a/tests/functional/filter/cram/filter-metadata-not-found-error.t b/tests/functional/filter/cram/filter-metadata-not-found-error.t
@@ -1,14 +1,13 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Try to filter on a metadata file that does not exist.
 
   $ ${AUGUR} filter \
   >  --metadata file-does-not-exist.tsv \
   >  --group-by year month \
   >  --sequences-per-group 1 \
-  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  >  --output-strains filtered_strains.txt > /dev/null
   ERROR: No such file or directory: 'file-does-not-exist.tsv'
   [2]
diff --git a/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t b/tests/functional/filter/cram/filter-metadata-sequence-strains-mismatch.t
@@ -1,7 +1,6 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Confirm that filtering omits strains without metadata or sequences.
 The input sequences are missing one strain that is in the metadata.
@@ -10,20 +9,25 @@ The list of strains to include has one strain with no metadata/sequence and one
 The query initially filters 3 strains from Colombia, one of which is added back by the include.
 
   $ ${AUGUR} filter \
-  >  --sequence-index filter/data/sequence_index.tsv \
-  >  --metadata filter/data/metadata.tsv \
+  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --query "country != 'Colombia'" \
   >  --non-nucleotide \
   >  --exclude-ambiguous-dates-by year \
-  >  --include filter/data/include.txt \
-  >  --output-strains "$TMP/filtered_strains.txt" \
-  >  --output-log "$TMP/filtered_log.tsv"
+  >  --include "$TESTDIR/../data/include.txt" \
+  >  --output-strains filtered_strains.txt \
+  >  --output-log filtered_log.tsv
   4 strains were dropped during filtering
   \t1 had no metadata (esc)
   \t1 had no sequence data (esc)
   \t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
-  \t1 strains were added back because they were in filter/data/include.txt (esc)
+  \\t1 strains were added back because they were in .*include\.txt.* (re)
   9 strains passed all filters
 
-  $ diff -u <(sort -k 1,1 filter/data/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
-  $ rm -f "$TMP/filtered_strains.txt"
+  $ (head -n1; sort -k 1,1) < filtered_log.tsv
+  strain	filter	kwargs
+  COL/FLR_00008/2015	filter_by_query	"[[""query"", ""country != 'Colombia'""]]"
+  COL/FLR_00008/2015\tforce_include_strains\t"[[""include_file"", ""*/data/include.txt""]]" (esc) (glob)
+  COL/FLR_00024/2015	filter_by_query	"[[""query"", ""country != 'Colombia'""]]"
+  Colombia/2016/ZC204Se	filter_by_query	"[[""query"", ""country != 'Colombia'""]]"
+  HND/2016/HU_ME59	filter_by_sequence_index	[]
diff --git a/tests/functional/filter/cram/filter-min-date.t b/tests/functional/filter/cram/filter-min-date.t
@@ -1,13 +1,11 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Filter using only metadata without a sequence index.
 This should work because the requested filters don't rely on sequence information.
 
   $ ${AUGUR} filter \
-  >  --metadata filter/data/metadata.tsv \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-date 2012 \
-  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
-  $ rm -f "$TMP/filtered_strains.txt"
+  >  --output-strains filtered_strains.txt > /dev/null
diff --git a/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t b/tests/functional/filter/cram/filter-min-length-no-sequence-index-error.t
@@ -1,14 +1,13 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Try to filter using only metadata without a sequence index.
 This should fail because the requested filters rely on sequence information.
 
   $ ${AUGUR} filter \
-  >  --metadata filter/data/metadata.tsv \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-length 10000 \
-  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  >  --output-strains filtered_strains.txt > /dev/null
   ERROR: You need to provide a sequence index or sequences to filter on sequence-specific information.
   [2]
diff --git a/tests/functional/filter/cram/filter-min-length-output-metadata.t b/tests/functional/filter/cram/filter-min-length-output-metadata.t
@@ -1,19 +1,17 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Filter using only metadata without sequence input or output and save results as filtered metadata.
 
   $ ${AUGUR} filter \
-  >  --sequence-index filter/data/sequence_index.tsv \
-  >  --metadata filter/data/metadata.tsv \
+  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-date 2012 \
   >  --min-length 10500 \
-  >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
+  >  --output-metadata filtered_metadata.tsv > /dev/null
 
 Output should include the 8 sequences matching the filters and a header line.
 
-  $ wc -l "$TMP/filtered_metadata.tsv"
+  $ wc -l filtered_metadata.tsv
   \s*9 .* (re)
-  $ rm -f "$TMP/filtered_metadata.tsv"
diff --git a/tests/functional/filter/cram/filter-min-length-output-strains.t b/tests/functional/filter/cram/filter-min-length-output-strains.t
@@ -1,19 +1,17 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Filter using only metadata and save results as a list of filtered strains.
 
   $ ${AUGUR} filter \
-  >  --sequence-index filter/data/sequence_index.tsv \
-  >  --metadata filter/data/metadata.tsv \
+  >  --sequence-index "$TESTDIR/../data/sequence_index.tsv" \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-date 2012 \
   >  --min-length 10500 \
-  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  >  --output-strains filtered_strains.txt > /dev/null
 
 Output should include only the 8 sequences matching the filters (without a header line).
 
-  $ wc -l "$TMP/filtered_strains.txt"
+  $ wc -l filtered_strains.txt
   \s*8 .* (re)
-  $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/cram/filter-min-max-date-output.t b/tests/functional/filter/cram/filter-min-max-date-output.t
@@ -1,15 +1,14 @@
 Setup
 
-  $ pushd "$TESTDIR" > /dev/null
-  $ source _setup.sh
+  $ source "$TESTDIR"/_setup.sh
 
 Check output of min/max date filters.
 
   $ ${AUGUR} filter \
-  >  --metadata filter/data/metadata.tsv \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --min-date 2015-01-01 \
   >  --max-date 2016-02-01 \
-  >  --output-metadata "$TMP/filtered_metadata.tsv"
+  >  --output-metadata filtered_metadata.tsv
   8 strains were dropped during filtering
   \t1 of these were dropped because they were earlier than 2015.0 or missing a date (esc)
   \t7 of these were dropped because they were later than 2016.09 or missing a date (esc)