From 601e86c3a292aacae0bebb06481518a1298a8706 Mon Sep 17 00:00:00 2001 From: Andrey Tovchigrechko Date: Mon, 29 Jan 2018 15:18:35 -0500 Subject: [PATCH 1/3] Option to run subprocess w/o shell --- ariba/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ariba/common.py b/ariba/common.py index 0410334f..a84528df 100644 --- a/ariba/common.py +++ b/ariba/common.py @@ -9,11 +9,11 @@ class Error (Exception): pass -def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, print_errors=True): +def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, print_errors=True, shell=True): if verbose: print('syscall:', cmd, flush=True, file=verbose_filehandle) try: - subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) + subprocess.check_output(cmd, shell=shell, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as error: errors = error.output.decode() if print_errors: From 4fb0392ce99bb6f36c8b4a24e18d2b2bb52b7902 Mon Sep 17 00:00:00 2001 From: Andrey Tovchigrechko Date: Wed, 31 Jan 2018 17:32:06 -0500 Subject: [PATCH 2/3] Remove reference to spades options in tests that are not using spades. Cleanup tmp dirs before copytree in case the test harness is repeated in a directory after previous run has failed with unhandled exceptions. --- ariba/tests/cluster_test.py | 72 +++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index 754c7a54..ea7c078a 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -43,6 +43,7 @@ def test_init_fail_files_missing(self): dirs = [os.path.join(data_dir, d) for d in dirs] for d in dirs: tmpdir = 'tmp.cluster_test_init_fail_files_missing' + shutil.rmtree(tmpdir,ignore_errors=True) shutil.copytree(d, tmpdir) with self.assertRaises(cluster.Error): cluster.Cluster(tmpdir, 'name', refdata=refdata, total_reads=42, total_reads_bases=4242) @@ -100,6 +101,7 @@ def test_full_run_no_reads_after_filtering(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering.in.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.test_full_run_no_reads_after_filtering' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering'), tmpdir) c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=0, total_reads_bases=0) @@ -118,9 +120,10 @@ def test_full_run_choose_ref_fail(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.in.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.test_full_run_choose_ref_fail' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=2, total_reads_bases=108, spades_other_options='--only-assembler') + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=2, total_reads_bases=108) c.run() expected = '\t'.join(['.', '.', '.', '.', '1024', '2', 'cluster_name'] + ['.'] * 24) @@ -137,9 +140,10 @@ def test_full_run_ref_not_in_cluster(self): refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.test_full_run_ref_not_in_cluster' all_refs_fa = os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster.all_refs.fa') + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ref_not_in_cluster'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=72, total_reads_bases=3600, all_ref_seqs_fasta=all_refs_fa) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=72, total_reads_bases=3600, all_ref_seqs_fasta=all_refs_fa) c.run() expected = '\t'.join(['.', '.', '.', '.', '1024', '72', 'cluster_name'] + ['.'] * 24) @@ -155,6 +159,7 @@ def test_full_run_assembly_fail(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.in.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.test_full_run_assembly_fail' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_assembly_fail'), tmpdir) c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=4, total_reads_bases=304) @@ -173,9 +178,10 @@ def test_full_run_ok_non_coding(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.metadata.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.test_full_run_ok_non_coding' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=72, total_reads_bases=3600) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=72, total_reads_bases=3600) c.run() self.maxDiff=None @@ -198,9 +204,10 @@ def test_full_run_ok_presence_absence(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.metadata.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_presence_absence' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=64, total_reads_bases=3200) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=64, total_reads_bases=3200) c.run() expected = [ @@ -220,9 +227,10 @@ def test_full_run_ok_variants_only_variant_not_present(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.metadata.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=66, total_reads_bases=3300) c.run() expected = [ 'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, so do not report\tGeneric description of variants_only1' @@ -237,9 +245,10 @@ def test_full_run_ok_variants_only_variant_not_present_always_report(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_full_run_varonly.not_present.always_report' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=66, total_reads_bases=3300) c.run() expected = [ 'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1' @@ -254,9 +263,10 @@ def test_full_run_ok_variants_only_variant_is_present(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.present.metadata.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.present' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=66, total_reads_bases=3300) c.run() expected = [ @@ -273,8 +283,9 @@ def test_full_run_ok_gene_start_mismatch(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.metadata.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=112, total_reads_bases=1080) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=112, total_reads_bases=1080) c.run() expected = [ 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene' @@ -289,8 +300,9 @@ def test_full_run_smtls_snp_presabs_gene(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_gene' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() expected = [ 'ref_gene\tref_gene\t1\t0\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene' @@ -307,8 +319,9 @@ def test_full_run_smtls_snp_varonly_gene_2(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() expected = [ 'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene' @@ -323,8 +336,9 @@ def test_full_run_known_smtls_snp_presabs_gene(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -342,8 +356,9 @@ def test_full_run_smtls_snp_varonly_gene_no_snp(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_smtls_snp_varonly_gene_no_snp' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_no_snp'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -361,8 +376,9 @@ def test_full_run_smtls_snp_varonly_gene(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -380,8 +396,9 @@ def test_full_run_smtls_snp_presabs_nonc(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_smtls_snp_presabs_nonc' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_nonc'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() expected = [ 'ref_seq\tref_seq\t0\t0\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq' @@ -396,8 +413,9 @@ def test_full_run_smtls_known_snp_presabs_nonc(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_smtls_known_snp_presabs_nonc' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() expected = [ 'ref_seq\tref_seq\t0\t0\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tG18A\t0\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:0:G18A:.:Description of G18A\tGeneric description of ref_seq' @@ -412,8 +430,9 @@ def test_full_run_smtls_snp_varonly_nonc(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() expected = [ 'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq' @@ -428,8 +447,9 @@ def test_full_run_known_smtls_snp_presabs_nonc(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_noncoding' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_nonc'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -447,8 +467,9 @@ def test_full_run_smtls_snp_varonly_nonc_no_snp(self): tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc_no_snp'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -466,8 +487,9 @@ def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=148, total_reads_bases=13320) c.run() # We shouldn't get an extra 'HET' line because we already know about the snp, so @@ -485,8 +507,9 @@ def test_full_run_partial_assembly(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_partial_assembly' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=278, total_reads_bases=15020) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=278, total_reads_bases=15020) c.run() expected = [ @@ -502,8 +525,9 @@ def test_full_run_multiple_vars_in_codon(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_run_multiple_vars' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_multiple_vars'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=292, total_reads_bases=20900) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=292, total_reads_bases=20900) c.run() expected = [ @@ -520,8 +544,9 @@ def test_full_run_delete_codon(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_delete_codon.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_delete_codon' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_delete_codon'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=292, total_reads_bases=20900) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=292, total_reads_bases=20900) c.run() expected = [ @@ -537,8 +562,9 @@ def test_full_run_insert_codon(self): tsv_in = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.tsv') refdata = reference_data.ReferenceData([fasta_in], [tsv_in]) tmpdir = 'tmp.cluster_test_full_insert_codon' + shutil.rmtree(tmpdir, ignore_errors=True) shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_insert_codon'), tmpdir) - c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=292, total_reads_bases=20900) + c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=292, total_reads_bases=20900) c.run() expected = [ From e86c9c122fb4d736a7d139f025afd06f0e804669 Mon Sep 17 00:00:00 2001 From: Andrey Tovchigrechko Date: Wed, 31 Jan 2018 18:47:25 -0500 Subject: [PATCH 3/3] Reintroduce Spades as another assembly option. Multithreading for Spades and Bowtie2. Spades improves upon default fermilight on challenging datasets with highly uneven coverage (viral, amplicon, single-cell). Multithreading for Spades and Bowtie2 subprocesses is adaptive - kicks in at the end of a multiprocessing map run when idle threads were appearing otherwise (or in cases of overall fewer clusters than total threads). --- ariba/assembly.py | 114 ++++++++- ariba/cluster.py | 95 ++++--- ariba/clusters.py | 199 +++++++++------ ariba/common.py | 6 +- ariba/external_progs.py | 16 +- ariba/tasks/run.py | 4 +- ariba/tests/assembly_test.py | 48 +++- ...test_assemble_with_spades_fails_reads_1.fq | 4 + ...test_assemble_with_spades_fails_reads_2.fq | 5 + ...embly_test_assemble_with_spades_reads_1.fq | 232 ++++++++++++++++++ ...embly_test_assemble_with_spades_reads_2.fq | 232 ++++++++++++++++++ ...ssembly_test_check_spades_log_file.log.bad | 5 + ...sembly_test_check_spades_log_file.log.good | 5 + scripts/ariba | 6 + 14 files changed, 863 insertions(+), 108 deletions(-) create mode 100644 ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_1.fq create mode 100644 ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_2.fq create mode 100644 ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq create mode 100644 ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq create mode 100644 ariba/tests/data/assembly_test_check_spades_log_file.log.bad create mode 100644 ariba/tests/data/assembly_test_check_spades_log_file.log.good diff --git a/ariba/assembly.py b/ariba/assembly.py index f028a632..0d64882c 100644 --- a/ariba/assembly.py +++ b/ariba/assembly.py @@ -5,6 +5,7 @@ import pymummer import fermilite_ariba from ariba import common, faidx, mapping, bam_parse, external_progs, ref_seq_chooser +import shlex class Error (Exception): pass @@ -29,6 +30,9 @@ def __init__(self, nucmer_breaklen=200, extern_progs=None, clean=True, + spades_mode="wgs", + spades_options=None, + threads=1 ): self.reads1 = os.path.abspath(reads1) self.reads2 = os.path.abspath(reads2) @@ -50,6 +54,9 @@ def __init__(self, self.nucmer_min_len = nucmer_min_len self.nucmer_breaklen = nucmer_breaklen self.clean = clean + self.spades_mode = spades_mode + self.spades_options = spades_options + self.threads = threads if extern_progs is None: self.extern_progs = external_progs.ExternalProgs() @@ -94,6 +101,106 @@ def _assemble_with_fermilite(self): self.assembled_ok = (got_from_fermilite == 0) os.chdir(cwd) + @staticmethod + def _check_spades_log_file(logfile): + '''SPAdes can fail with a strange error. Stop everything if this happens''' + f = pyfastaq.utils.open_file_read(logfile) + + for line in f: + if line.startswith('== Error == system call for:') and line.rstrip().endswith('finished abnormally, err code: -7'): + pyfastaq.utils.close(f) + print('Error running SPAdes. Cannot continue. This is the error from the log file', logfile, '...', file=sys.stderr) + print(line, file=sys.stderr) + raise Error('Fatal error ("err code: -7") running spades. Cannot continue') + + pyfastaq.utils.close(f) + return True + + def _assemble_with_spades(self): + cwd = os.getcwd() + self.assembled_ok = False + try: + try: + os.chdir(self.working_dir) + except: + raise Error('Error chdir ' + self.working_dir) + spades_exe = self.extern_progs.exe('spades') + if not spades_exe: + raise Error("Spades executable has not been found") + spades_options = self.spades_options + if spades_options is not None: + spades_options = shlex.split(self.spades_options) + if self.spades_mode == "rna": + spades_options = ["--rna"] + (["-k","127"] if spades_options is None else spades_options) + spades_out_seq_base = "transcripts.fasta" + elif self.spades_mode == "sc": + spades_options = ["--sc"] + (["-k", "33,55,77,99,127","--careful"] if spades_options is None else spades_options) + spades_out_seq_base = "contigs.fasta" + elif self.spades_mode == "wgs": + spades_options = ["-k", "33,55,77,99,127","--careful"] if spades_options is None else spades_options + spades_out_seq_base = "contigs.fasta" + else: + raise ValueError("Unknown spades_mode value: {}".format(self.spades_mode)) + asm_cmd = [spades_exe, "-t", str(self.threads), "--pe1-1", self.reads1, "--pe1-2", self.reads2, "-o", self.assembler_dir] + \ + spades_options + asm_ok,err = common.syscall(asm_cmd, verbose=True, verbose_filehandle=self.log_fh, shell=False, allow_fail=True) + if not asm_ok: + print('Assembly finished with errors. These are the errors:', file=self.log_fh) + print(err, file=self.log_fh) + print('\nEnd of spades errors\n', file=self.log_fh) + else: + + spades_log = os.path.join(self.assembler_dir, 'spades.log') + if os.path.exists(spades_log): + self._check_spades_log_file(spades_log) + + with open(spades_log) as f: + print('\n______________ SPAdes log ___________________\n', file=self.log_fh) + for line in f: + print(line.rstrip(), file=self.log_fh) + print('\n______________ End of SPAdes log _________________\n', file=self.log_fh) + + spades_warnings = os.path.join(self.assembler_dir, 'warnings.log') + if os.path.exists(spades_warnings): + with open(spades_warnings) as f: + print('\n______________ SPAdes warnings ___________________\n', file=self.log_fh) + for line in f: + print(line.rstrip(), file=self.log_fh) + print('\n______________ End of SPAdes warnings _________________\n', file=self.log_fh) + + ## fermilight module generates contig names that look like `cluster_1.l15.c17.ctg.1` where 'cluster_1'==self.contig_name_prefix + ## the whole structure of the contig name is expected in several places downstream where it is parsed into individual components. + ## For example, it is parsed into to l and c parts in ref_seq_chooser (although the parts are not actually used). + ## This is the code from fermilight module that generates the contig ID string: + ## ofs << ">" << namePrefix << ".l" << overlap << ".c" << minCount << ".ctg." << i + 1 << '\n' + ## + ## We generate the same contig name structure here using dummy values for overlap and minCount, in order + ## to avoid distrupting the downstream code. + ## Note that the fermilight module generates multiple versions of the assembly on a grid of l and c values, + ## and ref_seq_chooser then picks a single "best" (l,c) version based on coverage/identity of the nucmer + ## alignment to the reference. Spades generates a single version of the assembly, so ref_seq_chooser + ## can only pick that one version. + + spades_out_seq = os.path.join(self.assembler_dir,spades_out_seq_base) + ## No need really to use general-purpose pyfastaq.sequences.file_reader here and pay performance cost for + ## its multi-format line tests since we are only replacing the IDs in a pre-defined format + if os.path.exists(spades_out_seq): + with open(spades_out_seq,"r") as inp, open(self.all_assembly_contigs_fa,"w") as out: + pref = self.contig_name_prefix + i_cont = 0 + for line in inp: + if line.startswith(">"): + i_cont += 1 + line = ">{}.l15.c17.ctg.{}\n".format(pref,i_cont) + out.write(line) + if i_cont > 0: + self.assembled_ok = True + if self.clean: + print('Deleting assembly directory', self.assembler_dir, file=self.log_fh) + shutil.rmtree(self.assembler_dir,ignore_errors=True) + finally: + os.chdir(cwd) + @staticmethod def _fix_contig_orientation(contigs_fa, ref_fa, outfile, min_id=90, min_length=20, breaklen=200): @@ -148,7 +255,10 @@ def _parse_bam(sequences, bam, min_scaff_depth, max_insert): def run(self): - self._assemble_with_fermilite() + if self.assembler == 'fermilite': + self._assemble_with_fermilite() + elif self.assembler == "spades": + self._assemble_with_spades() print('Finished running assemblies', flush=True, file=self.log_fh) self.sequences = {} @@ -208,7 +318,7 @@ def run(self): self.reads2, self.final_assembly_fa, self.final_assembly_bam[:-4], - threads=1, + threads=self.threads, sort=True, bowtie2=self.extern_progs.exe('bowtie2'), bowtie2_version=self.extern_progs.version('bowtie2'), diff --git a/ariba/cluster.py b/ariba/cluster.py index 75d244b8..9d733f8d 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -43,10 +43,12 @@ def __init__(self, max_allele_freq=0.90, unique_threshold=0.03, max_gene_nt_extend=30, - spades_other_options=None, + spades_mode="rna", #["rna","wgs"] + spades_options=None, clean=True, extern_progs=None, random_seed=42, + threads_total=1 ): self.root_dir = os.path.abspath(root_dir) self.read_store = read_store @@ -71,7 +73,8 @@ def __init__(self, self.sspace_k = sspace_k self.sspace_sd = sspace_sd self.reads_insert = reads_insert - self.spades_other_options = spades_other_options + self.spades_mode = spades_mode + self.spades_options = spades_options self.reads_for_assembly1 = os.path.join(self.root_dir, 'reads_for_assembly_1.fq') self.reads_for_assembly2 = os.path.join(self.root_dir, 'reads_for_assembly_2.fq') @@ -96,6 +99,9 @@ def __init__(self, self.status_flag = flag.Flag() self.clean = clean + self.threads_total = threads_total + self.remaining_clusters = None + self.assembly_dir = os.path.join(self.root_dir, 'Assembly') self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa') self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam') @@ -138,13 +144,31 @@ def __init__(self, for s in wanted_signals: signal.signal(s, self._receive_signal) + def _update_threads(self): + """Update available thread count post-construction. + To be called any number of times from run() method""" + if self.remaining_clusters is not None: + self.threads = max(1,self.threads_total//self.remaining_clusters.value) + #otherwise just keep the current (initial) value + print("{} detected {} threads available to it".format(self.name,self.threads), file = self.log_fh) + + def _report_completion(self): + """Update shared counters to signal that we are done with this cluster. + Call just before exiting run() method (in a finally clause)""" + rem_clust = self.remaining_clusters + if rem_clust is not None: + # -= is non-atomic, need to acquire a lock + with self.remaining_clusters_lock: + rem_clust.value -= 1 + # we do not need this object anymore + self.remaining_clusters = None + print("{} reported completion".format(self.name), file=self.log_fh) def _atexit(self): if self.log_fh is not None: pyfastaq.utils.close(self.log_fh) self.log_fh = None - def _receive_signal(self, signum, stack): print('Signal', signum, 'received in cluster', self.name + '... Stopping!', file=sys.stderr, flush=True) if self.log_fh is not None: @@ -190,7 +214,10 @@ def _set_up_input_files(self): def _clean_file(self, filename): if self.clean: print('Deleting file', filename, file=self.log_fh) - os.unlink(filename) + try: #protect against OSError: [Errno 16] Device or resource busy: '.nfs0000000010f0f04f000003c9' and such + os.unlink(filename) + except: + pass def _clean(self): @@ -269,33 +296,39 @@ def _make_reads_for_assembly(number_of_wanted_reads, total_reads, reads_in1, rea return total_reads - def run(self): - self._set_up_input_files() + def run(self,remaining_clusters=None,remaining_clusters_lock=None): + try: + self.remaining_clusters = remaining_clusters + self.remaining_clusters_lock = remaining_clusters_lock + self._update_threads() + self._set_up_input_files() - for fname in [self.all_reads1, self.all_reads2, self.references_fa]: - if not os.path.exists(fname): - raise Error('File ' + fname + ' not found. Cannot continue') + for fname in [self.all_reads1, self.all_reads2, self.references_fa]: + if not os.path.exists(fname): + raise Error('File ' + fname + ' not found. Cannot continue') - original_dir = os.getcwd() - os.chdir(self.root_dir) + original_dir = os.getcwd() + os.chdir(self.root_dir) + + try: + self._run() + except Error as err: + os.chdir(original_dir) + print('Error running cluster! Error was:', err, sep='\n', file=self.log_fh) + pyfastaq.utils.close(self.log_fh) + self.log_fh = None + raise Error('Error running cluster ' + self.name + '!') - try: - self._run() - except Error as err: os.chdir(original_dir) - print('Error running cluster! Error was:', err, sep='\n', file=self.log_fh) + print('Finished', file=self.log_fh, flush=True) + print('{:_^79}'.format(' LOG FILE END ' + self.name + ' '), file=self.log_fh, flush=True) + + # This stops multiprocessing complaining with the error: + # multiprocessing.pool.MaybeEncodingError: Error sending result: '[]'. Reason: 'TypeError("cannot serialize '_io.TextIOWrapper' object",)' pyfastaq.utils.close(self.log_fh) self.log_fh = None - raise Error('Error running cluster ' + self.name + '!') - - os.chdir(original_dir) - print('Finished', file=self.log_fh, flush=True) - print('{:_^79}'.format(' LOG FILE END ' + self.name + ' '), file=self.log_fh, flush=True) - - # This stops multiprocessing complaining with the error: - # multiprocessing.pool.MaybeEncodingError: Error sending result: '[]'. Reason: 'TypeError("cannot serialize '_io.TextIOWrapper' object",)' - pyfastaq.utils.close(self.log_fh) - self.log_fh = None + finally: + self._report_completion() def _run(self): @@ -310,6 +343,7 @@ def _run(self): print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True) print('Assembling reads:', file=self.log_fh, flush=True) + self._update_threads() self.assembly = assembly.Assembly( self.reads_for_assembly1, self.reads_for_assembly2, @@ -323,7 +357,10 @@ def _run(self): contig_name_prefix=self.name, assembler=self.assembler, extern_progs=self.extern_progs, - clean=self.clean + clean=self.clean, + spades_mode=self.spades_mode, + spades_options=self.spades_options, + threads=self.threads ) self.assembly.run() @@ -332,7 +369,7 @@ def _run(self): self._clean_file(self.reads_for_assembly2) if self.clean: print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True) - shutil.rmtree(self.assembly_dir) + shutil.rmtree(self.assembly_dir,ignore_errors=True) if self.assembled_ok and self.assembly.ref_seq_name is not None: @@ -342,13 +379,13 @@ def _run(self): self.is_variant_only = '1' if is_variant_only else '0' print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True) - + self._update_threads() mapping.run_bowtie2( self.all_reads1, self.all_reads2, self.final_assembly_fa, self.final_assembly_bam[:-4], - threads=1, + threads=self.threads, sort=True, bowtie2=self.extern_progs.exe('bowtie2'), bowtie2_preset='very-sensitive-local', diff --git a/ariba/clusters.py b/ariba/clusters.py index ae50cd56..60cdd714 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -14,8 +14,11 @@ class Error (Exception): pass - -def _run_cluster(obj, verbose, clean, fails_dir): +# passing shared objects (remaining_clusters) through here and thus making them +# explicit arguments to Pool.startmap when running this function. That seems to be +# a recommended safe transfer mechanism as opposed making them attributes of a +# pre-constructed 'obj' variable (although the docs are a bit hazy on that) +def _run_cluster(obj, verbose, clean, fails_dir, remaining_clusters, remaining_clusters_lock): failed_clusters = os.listdir(fails_dir) if len(failed_clusters) > 0: @@ -25,7 +28,7 @@ def _run_cluster(obj, verbose, clean, fails_dir): if verbose: print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True) try: - obj.run() + obj.run(remaining_clusters=remaining_clusters,remaining_clusters_lock=remaining_clusters_lock) except: print('Failed cluster:', obj.name, file=sys.stderr) with open(os.path.join(fails_dir, obj.name), 'w'): @@ -38,7 +41,10 @@ def _run_cluster(obj, verbose, clean, fails_dir): if verbose: print('Deleting cluster dir', obj.root_dir, flush=True) if os.path.exists(obj.root_dir): - shutil.rmtree(obj.root_dir) + try: + shutil.rmtree(obj.root_dir) + except: + pass return obj @@ -56,7 +62,8 @@ def __init__(self, threads=1, verbose=False, assembler='fermilite', - spades_other=None, + spades_mode='rna', + spades_options=None, max_insert=1000, min_scaff_depth=10, nucmer_min_id=90, @@ -86,10 +93,10 @@ def __init__(self, self.logs_dir = os.path.join(self.outdir, 'Logs') self.assembler = assembler - assert self.assembler in ['fermilite'] self.assembly_kmer = assembly_kmer self.assembly_coverage = assembly_coverage - self.spades_other = spades_other + self.spades_mode = spades_mode + self.spades_options = spades_options self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit') self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa' @@ -135,7 +142,6 @@ def __init__(self, os.mkdir(d) except: raise Error('Error mkdir ' + d) - if tmp_dir is None: if 'ARIBA_TMPDIR' in os.environ: tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR']) @@ -373,6 +379,28 @@ def _init_and_run_clusters(self): cluster_list = [] self.log_files = [] + # How the thread count withing each Cluster.run is managed: + # We want to handle those cases where there are more total threads allocated to the application than there are clusters + # remaining to run (for example, + # there are only two references, and eight threads). If we keep the default thread value of 1 in cluster. Cluster, + # then we will be wasting the allocated threads. The most simple approach would be to divide all threads equally between clusters + # before calling Pool.map. Multithreaded external programs like Spades and Bowtie2 are then called with multiple threads. That should + # never be slower than keeping just one thread in cluster.Cluster, except maybe in the extreme cases when (if) + # a multi-threaded run of the external program takes longer wall-clock time than a single-threaded one. + # However, this solution would always keep + # Cluster.threads=1 if the initial number of clusters > number of total threads. This can result in inefficiency at the + # tail of the Pool.map execution flow - when the clusters are getting finished overall, we are waiting for the completion of + # fewer and fewer remaining + # single-threaded cluster tasks while more and more total threads are staying idle. We mitigate this through the following approach: + # - Create a shared Value object that holds the number of remaining clusters (remaining_clusters). + # - Each Cluster.run decrements the remaining_clusters when it completes + # - Cluster.run sets its own thread count to max(1,threads_total//remaining_clusters). This can be done as many times + # as needed at various points within Cluster.run (e.g. once before Spades is called, and again before Bowtie2 is called), + # in order to catch more idle threads. + # This is a simple and conservative approach to adaptively use all threads at the tail of the map flow. It + # never over-subscribes the threads, and it does not require any extra blocking within Cluster.run in order to + # wait for threads becoming available. + for cluster_name in sorted(self.cluster_to_dir): counter += 1 @@ -406,25 +434,56 @@ def _init_and_run_clusters(self): reads_insert=self.insert_size, sspace_k=self.min_scaff_depth, sspace_sd=self.insert_sspace_sd, - threads=1, # clusters now run in parallel, so this should always be 1! + threads=1, # initially set to 1, then will adaptively self-modify while running assembled_threshold=self.assembled_threshold, unique_threshold=self.unique_threshold, max_gene_nt_extend=self.max_gene_nt_extend, - spades_other_options=self.spades_other, + spades_mode=self.spades_mode, + spades_options=self.spades_options, clean=self.clean, extern_progs=self.extern_progs, + threads_total=self.threads )) - + # Here is why we use proxy objects from a Manager process below + # instead of simple shared multiprocessing.Value counter: + # Shared memory objects in multiprocessing use tempfile module to + # create temporary directory, then create temporary file inside it, + # memmap the file and unlink it. If TMPDIR envar points to a NFS + # mount, the final cleanup handler from multiprocessing will often + # return an exception due to a stale NFS file (.nfsxxxx) from a shutil.rmtree + # call. See help on tempfile.gettempdir() for how the default location of + # temporary files is selected. The exception is caught in except clause + # inside multiprocessing cleanup, and only a harmless traceback is printed, + # but it looks very spooky to the user and causes confusion. We use + # instead shared proxies from the Manager. Those do not rely on shared + # memory, and thus bypass the NFS issues. The counter is accesses infrequently + # relative to computations, so the performance does not suffer. + # default authkey in the manager will be some generated random-looking string + manager = multiprocessing.Manager() + remaining_clusters = manager.Value('l',len(cluster_list)) + # manager.Value does not provide access to the internal RLock that we need for + # implementing atomic -=, so we need to carry around a separate RLock object. + remaining_clusters_lock = manager.RLock() try: if self.threads > 1: self.pool = multiprocessing.Pool(self.threads) - cluster_list = self.pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose), itertools.repeat(self.clean), itertools.repeat(self.fails_dir))) + cluster_list = self.pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose), itertools.repeat(self.clean), itertools.repeat(self.fails_dir), + itertools.repeat(remaining_clusters),itertools.repeat(remaining_clusters_lock))) + # harvest the pool as soon as we no longer need it + self.pool.close() + self.pool.join() else: for c in cluster_list: - _run_cluster(c, self.verbose, self.clean, self.fails_dir) + _run_cluster(c, self.verbose, self.clean, self.fails_dir, remaining_clusters, remaining_clusters_lock) except: self.clusters_all_ran_ok = False + if self.verbose: + print('Final value of remaining_clusters counter:', remaining_clusters) + remaining_clusters = None + remaining_clusters_lock = None + manager.shutdown() + if len(os.listdir(self.fails_dir)) > 0: self.clusters_all_ran_ok = False @@ -498,7 +557,7 @@ def _write_catted_genes_matching_refs_fasta(self, outfile): def _clean(self): if self.clean: - shutil.rmtree(self.fails_dir) + shutil.rmtree(self.fails_dir,ignore_errors=True) try: self.tmp_dir_obj.cleanup() @@ -507,10 +566,7 @@ def _clean(self): if self.verbose: print('Deleting Logs directory', self.logs_dir) - try: - shutil.rmtree(self.logs_dir) - except: - pass + shutil.rmtree(self.logs_dir,ignore_errors=True) try: if self.verbose: @@ -551,67 +607,68 @@ def run(self): def _run(self): cwd = os.getcwd() - os.chdir(self.outdir) - self.write_versions_file(cwd) - self._map_and_cluster_reads() - self.log_files = None - - if len(self.cluster_to_dir) > 0: - got_insert_data_ok = self._set_insert_size_data() - if not got_insert_data_ok: - print('WARNING: not enough proper read pairs (found ' + str(self.proper_pairs) + ') to determine insert size.', file=sys.stderr) - print('This probably means that very few reads were mapped at all. No local assemblies will be run', file=sys.stderr) - if self.verbose: - print('Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.', flush=True) + try: + os.chdir(self.outdir) + self.write_versions_file(cwd) + self._map_and_cluster_reads() + self.log_files = None + + if len(self.cluster_to_dir) > 0: + got_insert_data_ok = self._set_insert_size_data() + if not got_insert_data_ok: + print('WARNING: not enough proper read pairs (found ' + str(self.proper_pairs) + ') to determine insert size.', file=sys.stderr) + print('This probably means that very few reads were mapped at all. No local assemblies will be run', file=sys.stderr) + if self.verbose: + print('Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.', flush=True) + else: + if self.verbose: + print('{:_^79}'.format(' Assembling each cluster ')) + print('Will run', self.threads, 'cluster(s) in parallel', flush=True) + self._init_and_run_clusters() + if self.verbose: + print('Finished assembling clusters\n') else: if self.verbose: - print('{:_^79}'.format(' Assembling each cluster ')) - print('Will run', self.threads, 'cluster(s) in parallel', flush=True) - self._init_and_run_clusters() - if self.verbose: - print('Finished assembling clusters\n') - else: - if self.verbose: - print('No reads mapped. Skipping all assemblies', flush=True) - print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr) + print('No reads mapped. Skipping all assemblies', flush=True) + print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr) - if not self.clusters_all_ran_ok: - raise Error('At least one cluster failed! Stopping...') + if not self.clusters_all_ran_ok: + raise Error('At least one cluster failed! Stopping...') - if self.verbose: - print('{:_^79}'.format(' Writing reports '), flush=True) - print('Making', self.report_file_all_tsv) - self._write_report(self.clusters, self.report_file_all_tsv) + if self.verbose: + print('{:_^79}'.format(' Writing reports '), flush=True) + print('Making', self.report_file_all_tsv) + self._write_report(self.clusters, self.report_file_all_tsv) - if self.verbose: - print('Making', self.report_file_filtered) - rf = report_filter.ReportFilter(infile=self.report_file_all_tsv) - rf.run(self.report_file_filtered) + if self.verbose: + print('Making', self.report_file_filtered) + rf = report_filter.ReportFilter(infile=self.report_file_all_tsv) + rf.run(self.report_file_filtered) - if self.verbose: - print() - print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True) - print(self.catted_assembled_seqs_fasta, 'and', self.catted_genes_matching_refs_fasta, flush=True) - self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta) - self._write_catted_genes_matching_refs_fasta(self.catted_genes_matching_refs_fasta) - self._write_catted_assemblies_fasta(self.catted_assemblies_fasta) - - if self.log_files is not None: - clusters_log_file = os.path.join(self.outdir, 'log.clusters.gz') if self.verbose: print() - print('{:_^79}'.format(' Catting cluster log files '), flush=True) - print('Writing file', clusters_log_file, flush=True) - common.cat_files(self.log_files, clusters_log_file) - - if self.verbose: - print() - print('{:_^79}'.format(' Cleaning files '), flush=True) - self._clean() + print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True) + print(self.catted_assembled_seqs_fasta, 'and', self.catted_genes_matching_refs_fasta, flush=True) + self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta) + self._write_catted_genes_matching_refs_fasta(self.catted_genes_matching_refs_fasta) + self._write_catted_assemblies_fasta(self.catted_assemblies_fasta) + + if self.log_files is not None: + clusters_log_file = os.path.join(self.outdir, 'log.clusters.gz') + if self.verbose: + print() + print('{:_^79}'.format(' Catting cluster log files '), flush=True) + print('Writing file', clusters_log_file, flush=True) + common.cat_files(self.log_files, clusters_log_file) - Clusters._write_mlst_reports(self.mlst_profile_file, self.report_file_filtered, self.mlst_reports_prefix, verbose=self.verbose) + if self.verbose: + print() + print('{:_^79}'.format(' Cleaning files '), flush=True) + self._clean() - if self.clusters_all_ran_ok and self.verbose: - print('\nAll done!\n') + Clusters._write_mlst_reports(self.mlst_profile_file, self.report_file_filtered, self.mlst_reports_prefix, verbose=self.verbose) - os.chdir(cwd) + if self.clusters_all_ran_ok and self.verbose: + print('\nAll done!\n') + finally: + os.chdir(cwd) diff --git a/ariba/common.py b/ariba/common.py index a84528df..92ab3c92 100644 --- a/ariba/common.py +++ b/ariba/common.py @@ -12,6 +12,8 @@ class Error (Exception): pass def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, print_errors=True, shell=True): if verbose: print('syscall:', cmd, flush=True, file=verbose_filehandle) + if not shell: + print('syscall string:', " ".join('"{}"'.format(_) for _ in cmd), flush=True, file=verbose_filehandle) try: subprocess.check_output(cmd, shell=shell, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as error: @@ -26,7 +28,9 @@ def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, return False, errors else: sys.exit(1) - + except Exception as msg: + print("Unexpected exception: ", msg, file=sys.stderr) + raise return True, None diff --git a/ariba/external_progs.py b/ariba/external_progs.py index dbbe0775..dfcd4e64 100644 --- a/ariba/external_progs.py +++ b/ariba/external_progs.py @@ -13,6 +13,7 @@ class Error (Exception): pass 'bowtie2': 'bowtie2', 'cdhit': 'cd-hit-est', 'nucmer' : 'nucmer', + 'spades' : 'spades.py' } @@ -23,6 +24,7 @@ class Error (Exception): pass 'bowtie2': ('--version', re.compile('.*bowtie2.*version (.*)$')), 'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')), 'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')), + 'spades': ('--version', re.compile('SPAdes\s+v([0-9\.]+)')) } @@ -30,8 +32,12 @@ class Error (Exception): pass 'bowtie2': '2.1.0', 'cdhit': '4.6', 'nucmer': '3.1', + 'spades': '3.11.0' } +prog_optional = set([ + 'spades' +]) class ExternalProgs: def __init__(self, verbose=False, fail_on_error=True): @@ -47,11 +53,15 @@ def __init__(self, verbose=False, fail_on_error=True): warnings = [] for prog in sorted(prog_to_default): + msg_sink = errors + if prog in prog_optional: + msg_sink = warnings + prog_exe = self._get_exe(prog) self.progs[prog] = shutil.which(prog_exe) if self.progs[prog] is None: - errors.append(prog + ' not found in path. Looked for ' + prog_exe) + msg_sink.append(prog + ' not found in path. Looked for ' + prog_exe) self.version_report.append('\t'.join([prog, 'NA', 'NOT_FOUND'])) if verbose: @@ -63,10 +73,10 @@ def __init__(self, verbose=False, fail_on_error=True): if got_version: self.versions[prog] = version if prog in min_versions and LooseVersion(version) < LooseVersion(min_versions[prog]): - errors.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '. Found it here:', prog_exe])) + msg_sink.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '. Found it here:', prog_exe])) else: self.versions[prog] = None - errors.append(version) + msg_sink.append(version) version = 'ERROR' self.version_report.append('\t'.join([prog, version, self.progs[prog]])) diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index db3bd324..75f484fd 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -47,7 +47,7 @@ def run(options): extern_progs, version_report_lines=version_report_lines, assembly_coverage=options.assembly_cov, - assembler='fermilite', + assembler=options.assembler, threads=options.threads, verbose=options.verbose, min_scaff_depth=options.min_scaff_depth, @@ -59,6 +59,8 @@ def run(options): max_gene_nt_extend=options.gene_nt_extend, clean=(not options.noclean), tmp_dir=options.tmp_dir, + spades_mode=options.spades_mode, + spades_options=options.spades_options ) c.run() diff --git a/ariba/tests/assembly_test.py b/ariba/tests/assembly_test.py index f33f2c52..06368493 100644 --- a/ariba/tests/assembly_test.py +++ b/ariba/tests/assembly_test.py @@ -5,10 +5,11 @@ import filecmp import pyfastaq from ariba import assembly +from ariba import external_progs modules_dir = os.path.dirname(os.path.abspath(assembly.__file__)) data_dir = os.path.join(modules_dir, 'tests', 'data') - +extern_progs = external_progs.ExternalProgs() class TestAssembly(unittest.TestCase): def test_run_fermilite(self): @@ -102,3 +103,48 @@ def test_parse_bam(self): os.unlink(bam + '.unmapped_mates') os.unlink(bam + '.scaff') + def test_check_spades_log_file(self): + '''test _check_spades_log_file''' + good_file = os.path.join(data_dir, 'assembly_test_check_spades_log_file.log.good') + bad_file = os.path.join(data_dir, 'assembly_test_check_spades_log_file.log.bad') + self.assertTrue(assembly.Assembly._check_spades_log_file(good_file)) + with self.assertRaises(assembly.Error): + self.assertTrue(assembly.Assembly._check_spades_log_file(bad_file)) + + @unittest.skipUnless(extern_progs.exe('spades'), "Spades assembler is optional and is not configured") + def test_assemble_with_spades(self): + '''test _assemble_with_spades''' + reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq') + reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq') + tmp_dir = 'tmp.test_assemble_with_spades' + tmp_log = 'tmp.test_assemble_with_spades.log' + with open(tmp_log, 'w') as tmp_log_fh: + print('First line', file=tmp_log_fh) + shutil.rmtree(tmp_dir, ignore_errors=True) + #using spades_options=" --only-assembler" because error correction cannot determine quality offset on this + #artificial dataset + a = assembly.Assembly(reads1, reads2, 'not needed', 'not needed', tmp_dir, 'not_needed_for_this_test.fa', + 'not_needed_for_this_test.bam', tmp_log_fh, 'not needed', + assembler="spades", spades_options=" --only-assembler") + a._assemble_with_spades() + self.assertTrue(a.assembled_ok) + shutil.rmtree(tmp_dir,ignore_errors=True) + os.unlink(tmp_log) + + @unittest.skipUnless(extern_progs.exe('spades'), "Spades assembler is optional and is not configured") + def test_assemble_with_spades_fail(self): + '''test _assemble_with_spades handles spades fail''' + reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_fails_reads_1.fq') + reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_fails_reads_2.fq') + tmp_dir = 'tmp.test_assemble_with_spades_fail' + tmp_log = 'tmp.test_assemble_with_spades_fail.log' + with open(tmp_log, 'w') as tmp_log_fh: + print('First line', file=tmp_log_fh) + shutil.rmtree(tmp_dir, ignore_errors=True) + a = assembly.Assembly(reads1, reads2, 'not needed', 'not needed', tmp_dir, 'not_needed_for_this_test.fa', + 'not_needed_for_this_test.bam', tmp_log_fh, 'not needed', + assembler="spades", spades_options=" --only-assembler") + a._assemble_with_spades() + self.assertFalse(a.assembled_ok) + shutil.rmtree(tmp_dir,ignore_errors=True) + os.unlink(tmp_log) diff --git a/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_1.fq b/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_1.fq new file mode 100644 index 00000000..15574dde --- /dev/null +++ b/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_1.fq @@ -0,0 +1,4 @@ +@read1/1 +CACGTTCGTCGTGATGACTGACGTCACGAGCTCTGCGTACGTCATCTAGCGTATCGTACTGACTGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_2.fq b/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_2.fq new file mode 100644 index 00000000..4274d1d6 --- /dev/null +++ b/ariba/tests/data/assembly_test_assemble_with_spades_fails_reads_2.fq @@ -0,0 +1,5 @@ +@read1/2 +CACGTTCGTCGTGATGACTGACGTCACGAGCTCTGCGTACGTCATCTAGCGTATCGTACTGACTGAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII + diff --git a/ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq new file mode 100644 index 00000000..c367e21f --- /dev/null +++ b/ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq @@ -0,0 +1,232 @@ +@1:1:82:186/1 +GCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:6:109/1 +GGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:1:106/1 +CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:33:136/1 +TGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:196:299/1 +CCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTTTAGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:63:168/1 +CAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:10:111/1 +TTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:74:178/1 +AGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:84:186/1 +TCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:1:41:144/1 +CCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:2:144:247/1 +ATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:3:225:329/1 +CTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:4:237:340/1 +GCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:5:45:151/1 +GACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:6:284:386/1 +CGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:7:305:407/1 +CTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:8:213:317/1 +CTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:9:183:287/1 +GATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:10:289:393/1 +TCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:11:296:399/1 +GTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:12:270:373/1 +GTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:13:167:271/1 +CTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:14:43:147/1 +TTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:15:323:424/1 +TCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:16:105:207/1 +AGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:17:237:341/1 +GCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:18:3:107/1 +CGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:19:272:374/1 +GTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:20:251:354/1 +GTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:21:95:199/1 +CTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:22:96:199/1 +TCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:23:94:197/1 +ACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:24:185:289/1 +TTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:25:152:256/1 +CGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:26:285:389/1 +GTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:27:137:241/1 +TGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:28:261:365/1 +GGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:29:12:116/1 +AGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:30:107:210/1 +CTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:31:162:266/1 +AGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:32:213:317.dup.2/1 +CTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:33:24:127/1 +TGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:34:84:189/1 +TCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:35:40:145/1 +CCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:36:120:223/1 +TGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:37:106:211/1 +GCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:38:98:202/1 +TGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:39:72:177/1 +AAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:40:16:120/1 +TGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:41:308:410/1 +GAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:42:26:129/1 +CCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:43:130:234/1 +CAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:44:52:157/1 +TCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:45:220:323/1 +GCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:46:125:228/1 +TCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:47:210:314/1 +TGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:48:220:324/1 +GCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:49:25:128/1 +GCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCATAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq new file mode 100644 index 00000000..14747f67 --- /dev/null +++ b/ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq @@ -0,0 +1,232 @@ +@1:1:82:186/2 +CCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACCTATA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:2:6:109/2 +TCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:3:1:106/2 +TCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:4:33:136/2 +CAATGGGAGTAGAAGGTCAACCTATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:5:196:299/2 +TTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:6:63:168/2 +CTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACCTATAATCGTCAGAGCACGATGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:7:10:111/2 +AATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:8:74:178/2 +GACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACCTATAATCGTCAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@1:9:84:186/2 +CCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACCTATA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:1:41:144/2 +GTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:2:144:247/2 +CGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:3:225:329/2 +TGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:4:237:340/2 +GGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:5:45:151/2 +AGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:6:284:386/2 +GGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:7:305:407/2 +CTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:8:213:317/2 +AGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:9:183:287/2 +GCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:10:289:393/2 +CATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:11:296:399/2 +TCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:12:270:373/2 +CCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:13:167:271/2 +TGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:14:43:147/2 +AGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:15:323:424/2 +CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:16:105:207/2 +GTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:17:237:341/2 +AGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:18:3:107/2 +GTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:19:272:374/2 +GCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:20:251:354/2 +TTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:21:95:199/2 +AACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:22:96:199/2 +AACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:23:94:197/2 +CACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:24:185:289/2 +AAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:25:152:256/2 +GAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:26:285:389/2 +GTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:27:137:241/2 +TACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:28:261:365/2 +GCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:29:12:116/2 +CATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:30:107:210/2 +CGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:31:162:266/2 +GAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:32:213:317.dup.2/2 +AGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:33:24:127/2 +TAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:34:84:189/2 +CGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:35:40:145/2 +CGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:36:120:223/2 +CACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:37:106:211/2 +ACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:38:98:202/2 +CGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:39:72:177/2 +ACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:40:16:120/2 +TCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:41:308:410/2 +AGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:42:26:129/2 +AGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:43:130:234/2 +TCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:44:52:157/2 +GACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:45:220:323/2 +AGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGA ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:46:125:228/2 +ATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:47:210:314/2 +AGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTAC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:48:220:324/2 +TAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@ref2:49:25:128/2 +GTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/assembly_test_check_spades_log_file.log.bad b/ariba/tests/data/assembly_test_check_spades_log_file.log.bad new file mode 100644 index 00000000..df881624 --- /dev/null +++ b/ariba/tests/data/assembly_test_check_spades_log_file.log.bad @@ -0,0 +1,5 @@ +line 1 +line 2 + +== Error == system call for: "['/foo/bar/SPAdes-3.6.0-Linux/bin/spades', '/spam/eggs/K21/configs/config.info']" finished abnormally, err code: -7 + diff --git a/ariba/tests/data/assembly_test_check_spades_log_file.log.good b/ariba/tests/data/assembly_test_check_spades_log_file.log.good new file mode 100644 index 00000000..f0f06879 --- /dev/null +++ b/ariba/tests/data/assembly_test_check_spades_log_file.log.good @@ -0,0 +1,5 @@ +This is a dummy spades log file. + +It doesn't look like a real spades log file. + +But it doesn't have lines matching the bad stuff that will mean ariba should stop NOW. diff --git a/scripts/ariba b/scripts/ariba index 498dddd0..8cf4a51c 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -222,8 +222,14 @@ nucmer_group.add_argument('--nucmer_min_len', type=int, help='Minimum alignment nucmer_group.add_argument('--nucmer_breaklen', type=int, help='Value to use for -breaklen when running nucmer [%(default)s]', default=200, metavar='INT') assembly_group = subparser_run.add_argument_group('Assembly options') +assembly_group.add_argument('--assembler', help='Assembler to use', choices=['fermilite','spades'], default='fermilite') assembly_group.add_argument('--assembly_cov', type=int, help='Target read coverage when sampling reads for assembly [%(default)s]', default=50, metavar='INT') assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs [%(default)s]', default=10, metavar='INT') +assembly_group.add_argument('--spades_mode', help='If using Spades assembler, either use default WGS mode, Single Cell mode (`spades.py --sc`) or RNA mode (`spades.py --rna`). ' + 'Use SC or RNA mode if your input is from a viral sequencing with very uneven and deep coverage. ' + 'Set `--assembly_cov` to some high value if using SC or RNA mode', choices=['wgs','sc','rna'], default='wgs') +assembly_group.add_argument('--spades_options', help='Extra options to pass to Spades assembler. Sensible default options will be picked based on `--spades_mode` argument. ' + 'Anything set here will replace the defaults completely') other_run_group = subparser_run.add_argument_group('Other options') other_run_group.add_argument('--threads', type=int, help='Experimental. Number of threads. Will run clusters in parallel, but not minimap (yet) [%(default)s]', default=1, metavar='INT')