From e0a5c1d1d4ec5e1e25a7fc93f4e250a5bab07a6b Mon Sep 17 00:00:00 2001 From: Jeffrey Barrick Date: Tue, 25 May 2021 17:16:43 -0500 Subject: [PATCH] Speed up and bug fix related to muliple base substitutions in a codon --- ChangeLog | 4 + configure.ac | 2 +- src/c/breseq/breseq_cmdline.cpp | 267 ++++++++++-------- src/c/breseq/genome_diff.cpp | 6 +- src/c/breseq/genome_diff_entry.cpp | 8 +- src/c/breseq/libbreseq/common.h | 4 +- src/c/breseq/libbreseq/genome_diff.h | 2 +- src/c/breseq/libbreseq/genome_diff_entry.h | 4 +- src/c/breseq/libbreseq/settings.h | 6 + src/c/breseq/reference_sequence.cpp | 146 +++++++--- src/c/breseq/settings.cpp | 6 +- tests/gdtools_convert_2/expected.json | 24 ++ .../expected.gd | 2 +- 13 files changed, 311 insertions(+), 170 deletions(-) diff --git a/ChangeLog b/ChangeLog index a81c2c3f..d2d84c5c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Version 0.35.7 + * Speedup and fix of rare bug related to annotating mutations to properly account for cases + in which multiple base substitutions occur in the same codon. + Version 0.35.6 * Should be compatible with GenBank reference files produced by Prokka and NCBI PGAP, and with GFF3 files produced by PGAP. diff --git a/configure.ac b/configure.ac index 8a39299d..304f11be 100644 --- a/configure.ac +++ b/configure.ac @@ -19,7 +19,7 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.65]) -AC_INIT([breseq], [0.35.6], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq]) +AC_INIT([breseq], [0.35.7], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq]) AC_CONFIG_AUX_DIR(aux_build) AC_CONFIG_MACRO_DIR([aux_build/m4]) AC_CONFIG_HEADERS([aux_build/config.h]) diff --git a/src/c/breseq/breseq_cmdline.cpp b/src/c/breseq/breseq_cmdline.cpp index c19ca3e6..3421c319 100644 --- a/src/c/breseq/breseq_cmdline.cpp +++ b/src/c/breseq/breseq_cmdline.cpp @@ -2270,138 +2270,173 @@ int breseq_default_action(int argc, char* argv[]) // Output Genome Diff File /// - cerr << "Creating merged genome diff evidence file..." << endl; + cGenomeDiff mpgd; + if (settings.do_step(settings.output_mutation_prediction_done_file_name, "Output :: Mutation Prediction")) { - // merge all of the evidence GenomeDiff files into one... - create_path(settings.evidence_path); - - cGenomeDiff jc_gd; - if (!settings.skip_new_junction_prediction) { - jc_gd.read(settings.jc_genome_diff_file_name); - } - - // Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist - MutationPredictor mpj(ref_seq_info); - mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful - - assign_junction_read_counts(settings, summary, jc_gd); + cerr << "Creating merged genome diff evidence file..." << endl; - cGenomeDiff ra_mc_gd; - if (!settings.skip_read_alignment_and_missing_coverage_prediction) { - ra_mc_gd.read(settings.ra_mc_genome_diff_file_name); - } - test_RA_evidence(ra_mc_gd, ref_seq_info, settings); - - cGenomeDiff evidence_gd; - evidence_gd.merge_preserving_duplicates(jc_gd); - evidence_gd.merge_preserving_duplicates(ra_mc_gd); - - // there is a copy number genome diff for each sequence separately - if (settings.do_copy_number_variation) { - for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) { - cAnnotatedSequence& seq = *it; - string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id); - cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name); - evidence_gd.merge_preserving_duplicates(cn_gd); + // merge all of the evidence GenomeDiff files into one... + create_path(settings.evidence_path); + + cGenomeDiff jc_gd; + if (!settings.skip_new_junction_prediction) { + jc_gd.read(settings.jc_genome_diff_file_name); } - } - evidence_gd.write(settings.evidence_genome_diff_file_name); + + // Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist + MutationPredictor mpj(ref_seq_info); + mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful + + assign_junction_read_counts(settings, summary, jc_gd); - // predict mutations from evidence in the GenomeDiff - cerr << "Predicting mutations from evidence..." << endl; + cGenomeDiff ra_mc_gd; + if (!settings.skip_read_alignment_and_missing_coverage_prediction) { + ra_mc_gd.read(settings.ra_mc_genome_diff_file_name); + } + test_RA_evidence(ra_mc_gd, ref_seq_info, settings); + + cGenomeDiff evidence_gd; + evidence_gd.merge_preserving_duplicates(jc_gd); + evidence_gd.merge_preserving_duplicates(ra_mc_gd); + + // there is a copy number genome diff for each sequence separately + if (settings.do_copy_number_variation) { + for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) { + cAnnotatedSequence& seq = *it; + string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id); + cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name); + evidence_gd.merge_preserving_duplicates(cn_gd); + } + } + evidence_gd.write(settings.evidence_genome_diff_file_name); - MutationPredictor mp(ref_seq_info); - cGenomeDiff mpgd(settings.evidence_genome_diff_file_name); - mp.predict(settings, summary, mpgd); - mpgd.reassign_unique_ids(); + // predict mutations from evidence in the GenomeDiff + cerr << "Predicting mutations from evidence..." << endl; - // Add metadata that will only be in the output.gd file - - //#=REFSEQ header lines. - mpgd.metadata.ref_seqs = settings.all_reference_file_names; - - //#=READSEQ header lines. - mpgd.metadata.read_seqs.resize(settings.read_files.size()); - for (size_t i = 0; i < settings.read_files.size(); i++) { - mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name(); - } - - // These fields will be overwritten by any header GenomeDiff provided - mpgd.metadata.title = settings.custom_run_name; - - // Load metadata from existing header GenomeDiff file - // This is purposefully done AFTER the previous metadata defs - // in order to overwrite those (but not others) - if (settings.header_genome_diff_file_name.size() > 0) { - cGenomeDiff header_gd(settings.header_genome_diff_file_name); - mpgd.metadata = header_gd.metadata; - } - - // Fields here and below will write over any values in the header GenomeDiff file - mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION); - mpgd.metadata.command = settings.full_command_line; - mpgd.metadata.created = Settings::time2string(time(NULL)); - - // Add additional SUMMARY metadata - - mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads)); - mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases)); - mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads)); - mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases)); - mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast(summary.alignment_resolution.total_reads_mapped_to_references))); - mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references)); - - // Write the output.gd file - cerr << " Writing final GD file..." << endl; - mpgd.write(settings.final_genome_diff_file_name); - - // Save a copy in the data folder as well - mpgd.write(settings.data_genome_diff_file_name); + MutationPredictor mp(ref_seq_info); + mpgd.read(settings.evidence_genome_diff_file_name); + mp.predict(settings, summary, mpgd); + mpgd.reassign_unique_ids(); - // Faster, less compatible way... - // SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name); - - //Don't reload -- we lose invisible fields that we need - //cGenomeDiff gd(settings.final_genome_diff_file_name); - cGenomeDiff gd = mpgd; - - // Write VCF conversion (to both data and output) - cerr << " Writing final VCF file..." << endl; - mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info); - mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info); + // Add metadata that will only be in the output.gd file + + //#=REFSEQ header lines. + mpgd.metadata.ref_seqs = settings.all_reference_file_names; + + //#=READSEQ header lines. + mpgd.metadata.read_seqs.resize(settings.read_files.size()); + for (size_t i = 0; i < settings.read_files.size(); i++) { + mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name(); + } + + // These fields will be overwritten by any header GenomeDiff provided + mpgd.metadata.title = settings.custom_run_name; + + // Load metadata from existing header GenomeDiff file + // This is purposefully done AFTER the previous metadata defs + // in order to overwrite those (but not others) + if (settings.header_genome_diff_file_name.size() > 0) { + cGenomeDiff header_gd(settings.header_genome_diff_file_name); + mpgd.metadata = header_gd.metadata; + } + + // Fields here and below will write over any values in the header GenomeDiff file + mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION); + mpgd.metadata.command = settings.full_command_line; + mpgd.metadata.created = Settings::time2string(time(NULL)); + + // Add additional SUMMARY metadata + + mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads)); + mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases)); + mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads)); + mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases)); + mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast(summary.alignment_resolution.total_reads_mapped_to_references))); + mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references)); + + // Write the output.gd file + cerr << " Writing final GD file..." << endl; + mpgd.write(settings.final_genome_diff_file_name); + + // Save a copy in the data folder as well + mpgd.write(settings.data_genome_diff_file_name); + + // Faster, less compatible way... + // SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name); + + // Save a copy with invisible fields in the annotation folder (this is what true does) + // We reload this to begin annotation if needed + mpgd.write(settings.preannotated_genome_diff_file_name, true); + settings.track_intermediate_file(settings.output_mutation_annotation_done_file_name, settings.preannotated_genome_diff_file_name); + + // Write VCF conversion (to both data and output) + cerr << " Writing final VCF file..." << endl; + mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info); + mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info); - // Write a final JSON file with all summary information (to both data and output) - PublicSummary public_summary(summary, settings, ref_seq_info); - public_summary.store(settings.data_json_summary_file_name); - public_summary.store(settings.output_json_summary_file_name); + // Write a final JSON file with all summary information (to both data and output) + PublicSummary public_summary(summary, settings, ref_seq_info); + public_summary.store(settings.data_json_summary_file_name); + public_summary.store(settings.output_json_summary_file_name); - - // - // Mark marginal items as no_show to prevent further processing - // - output::mark_gd_entries_no_show(settings, gd); + + settings.done_step(settings.output_mutation_prediction_done_file_name); - // - // Annotate mutations - // - cerr << "Annotating mutations..." << endl; - ref_seq_info.annotate_mutations(gd); + } - // Annotated Genome Diff output #1 - public version - gd.write(settings.data_annotated_genome_diff_file_name); + cGenomeDiff gd; + if (settings.do_step(settings.output_mutation_annotation_done_file_name, "Output :: Mutation Annotation")) { - // Annotated Genome Diff output #2 - used internally and for consistency tests - // - // Here we empty certain metadata items: CREATED time and PROGRAM breseq version - // that should be allowed to change without marking the output as failed! - gd.metadata.created=""; - gd.metadata.program=""; - gd.write(settings.annotated_genome_diff_file_name); - + // Might be faster to copy or use existing gd if already loaded? + + // Reload if necessary... (ex: interrupted in this step) +// gd = mpgd; +// (gd.get_list().size() == 0) { + gd.read(settings.preannotated_genome_diff_file_name); + // Blank this out of the title unless it has been reassigned, for consistency + if (gd.metadata.title == cString(settings.preannotated_genome_diff_file_name).get_base_name_no_extension()) { + gd.metadata.title = ""; + } + // } + + // + // Mark marginal items as no_show to prevent further processing + // + output::mark_gd_entries_no_show(settings, gd); + + // + // Annotate mutations + // + cerr << "Annotating mutations..." << endl; + ref_seq_info.annotate_mutations(gd); + + // Annotated Genome Diff output #1 - public version + gd.write(settings.data_annotated_genome_diff_file_name); + + // Annotated Genome Diff output #2 - used internally and for consistency tests + // + // Here we empty certain metadata items: CREATED time and PROGRAM breseq version + // that should be allowed to change without marking the output as failed! + gd.metadata.created=""; + gd.metadata.program=""; + gd.write(settings.annotated_genome_diff_file_name); + + settings.done_step(settings.output_mutation_annotation_done_file_name); + } + // // Plot coverage of genome and large deletions // + // Reload if necessary... (ex: interrupted in this step) + if (gd.get_list().size() == 0) { + gd.read(settings.annotated_genome_diff_file_name); + // Blank this out of the title unless it has been reassigned, for consistency + if (gd.metadata.title == cString(settings.annotated_genome_diff_file_name).get_base_name_no_extension()) { + gd.metadata.title = ""; + } + } + cerr << "Drawing coverage plots..." << endl; output::draw_coverage(settings, ref_seq_info, gd); diff --git a/src/c/breseq/genome_diff.cpp b/src/c/breseq/genome_diff.cpp index 4fd1882e..3b19a1a4 100644 --- a/src/c/breseq/genome_diff.cpp +++ b/src/c/breseq/genome_diff.cpp @@ -668,7 +668,7 @@ cFileParseErrors cGenomeDiff::valid_with_reference_sequences(cReferenceSequences 1) If you want a diff entry to be commented out(prefix with '#') add the key "comment_out" to the diff entry. */ -void cGenomeDiff::write(const string& filename) { +void cGenomeDiff::write(const string& filename, bool include_unprintable_fields) { string basename = cString(filename).get_base_name(); string dir = cString(filename).remove_ending(basename); if (dir.size()) { @@ -735,10 +735,10 @@ void cGenomeDiff::write(const string& filename) { for(diff_entry_list_t::iterator it=_entry_list.begin(); it!=_entry_list.end(); ++it) { if (!(*it)->entry_exists("comment_out")) { - fprintf(os, "%s\n", (**it).as_string().c_str()); + fprintf(os, "%s\n", (**it).as_string(include_unprintable_fields).c_str()); } else { (*it)->erase("comment_out"); - fprintf(os, "#%s\n", (**it).as_string().c_str()); + fprintf(os, "#%s\n", (**it).as_string(include_unprintable_fields).c_str()); } } os.close(); diff --git a/src/c/breseq/genome_diff_entry.cpp b/src/c/breseq/genome_diff_entry.cpp index b89c13fc..302db6f3 100644 --- a/src/c/breseq/genome_diff_entry.cpp +++ b/src/c/breseq/genome_diff_entry.cpp @@ -1185,7 +1185,7 @@ namespace breseq { /*! Marshal this diff entry into an ordered list of fields. */ - void cDiffEntry::marshal(vector& s) const { + void cDiffEntry::marshal(vector& s, bool include_unprintable_fields) const { s.push_back(gd_entry_type_lookup_table[_type]); s.push_back(_id); @@ -1221,7 +1221,7 @@ namespace breseq { for(diff_entry_map_t::iterator i=cp.begin(); i!=cp.end(); ++i) { assert(i->first.size()); - if (is_unprintable_key(i->first)) continue; + if (!include_unprintable_fields && is_unprintable_key(i->first)) continue; if (i->second.empty()) continue; // Be sure the entry is non-empty! Would rather have this as a check. @@ -1234,10 +1234,10 @@ namespace breseq { } // Created the line to be printed - string cDiffEntry::as_string(void) const + string cDiffEntry::as_string(bool include_unprintable_fields) const { vector fields; - marshal(fields); + marshal(fields, include_unprintable_fields); return join(fields, "\t"); } diff --git a/src/c/breseq/libbreseq/common.h b/src/c/breseq/libbreseq/common.h index 9a50d7e2..8b73abdc 100644 --- a/src/c/breseq/libbreseq/common.h +++ b/src/c/breseq/libbreseq/common.h @@ -1257,8 +1257,8 @@ inline cString cString::get_base_name() const //! Returns file name with no extension, removes any directory path beforehand. // input.1.gd -// remove_all_extensions = FALSE removes everything past the first period => input -// remove_all_extensions = TRUE removes only the last period and beyond => input.1 +// remove_all_extensions = TRUE removes everything past the first period => input +// remove_all_extensions = FALSE removes only the last period and beyond => input.1 inline cString cString::get_base_name_no_extension(bool remove_all_extensions, bool one_name_for_pair) const { cString this_return = this->get_base_name(); diff --git a/src/c/breseq/libbreseq/genome_diff.h b/src/c/breseq/libbreseq/genome_diff.h index 3f22d926..a3425231 100644 --- a/src/c/breseq/libbreseq/genome_diff.h +++ b/src/c/breseq/libbreseq/genome_diff.h @@ -127,7 +127,7 @@ class cGenomeDiff cFileParseErrors valid_with_reference_sequences(cReferenceSequences& ref_seq, bool suppress_errors = false); //! Write the genome diff to a file. - void write(const string& filename); + void write(const string& filename, bool include_unprintable_fields=false); //!---- Adding and Removing Entries ---- !// diff --git a/src/c/breseq/libbreseq/genome_diff_entry.h b/src/c/breseq/libbreseq/genome_diff_entry.h index bf0d35f8..37c8e8ca 100644 --- a/src/c/breseq/libbreseq/genome_diff_entry.h +++ b/src/c/breseq/libbreseq/genome_diff_entry.h @@ -325,10 +325,10 @@ namespace breseq { //!---- Output ---- !// //! Marshal this diff entry into an ordered list of fields. - virtual void marshal(vector& s) const; + virtual void marshal(vector& s, bool include_unprintable_fields=false) const; //! Serialize this diff entry into a string for output. - virtual string as_string(void) const; + virtual string as_string(bool include_unprintable_fields=false) const; //! Output all keys and values string as_key_values() const { diff --git a/src/c/breseq/libbreseq/settings.h b/src/c/breseq/libbreseq/settings.h index a10e2745..adcb5ecc 100644 --- a/src/c/breseq/libbreseq/settings.h +++ b/src/c/breseq/libbreseq/settings.h @@ -541,6 +541,10 @@ namespace breseq //! Paths: Output string output_path; string output_done_file_name; + // subsets of output + string output_mutation_prediction_done_file_name; + string output_mutation_annotation_done_file_name; + string output_json_summary_file_name; string output_vcf_file_name; @@ -553,6 +557,8 @@ namespace breseq string evidence_path; string evidence_genome_diff_file_name; string final_genome_diff_file_name; + string preannotated_genome_diff_file_name; + string annotated_genome_diff_file_name; string local_coverage_plot_path; diff --git a/src/c/breseq/reference_sequence.cpp b/src/c/breseq/reference_sequence.cpp index c010d810..455ba0ad 100644 --- a/src/c/breseq/reference_sequence.cpp +++ b/src/c/breseq/reference_sequence.cpp @@ -3520,7 +3520,7 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo } if (compare_key_list.size() == 0) compare_key_list.push_back(default_key); - map > snp_muts; + map > snp_muts; for (diff_entry_list_t::iterator it=muts.begin(); it!=muts.end(); it++) { @@ -3608,6 +3608,13 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo mut["ref_seq"] = this->get_sequence_1(mut[SEQ_ID], ref_start_1, ref_end_1); } + // @JEB: We look for repeat sequence in original reference sequence. + // This saves us from possibly looking at a shifted location... + if (mut._type == MOB) { + string rep_string = this->repeat_family_sequence(mut["repeat_name"], from_string(mut["strand"])); + mut["repeat_size"] = to_string(rep_string.length()); // saving this for shifting + } + categorize_1_mutation(mut, large_size_cutoff); } @@ -3628,18 +3635,73 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo } } - // Hugely inefficient step --!! + // @JEB 2020-05-24 Sped up this step by enforcing a lookahead distance and inserting wrapped entries + // // Scan SNPs to see if they affect the same codon, merge changes, and - // update amino acid changes assumes the list of snps is sorted. + // update amino acid changes. Assumes the lists of snps are sorted. + + const uint32_t k_num_entries_to_look_ahead(6); for(vector::iterator it_key = compare_key_list.begin(); it_key != compare_key_list.end(); it_key++) { string on_key = *it_key; - for (size_t i = 0; i < snp_muts[on_key].size(); i++){ + + /* + // Debug + cout << "====" << endl; + for (list::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){ + cDiffEntry& i = **it_i; + cout << i.as_string() << endl; + } + */ + + // We need to fix copy entries from the beginning of each seq_id to the end of each seq_id in order to wrap around the origin + // There's no need to check if it is circular. The distance calculation later takes care of this + list seq_id_begin_list; + list& this_gd_list(snp_muts[on_key]); + + string current_seq_id; + uint32_t begin_count(0); + for (list::iterator it_i = this_gd_list.begin(); it_i != this_gd_list.end(); it_i++){ + cDiffEntry& i = **it_i; + if (i["seq_id"] != current_seq_id){ + begin_count = 0; + if (current_seq_id.size() > 0) { + // Insert after the one before... resetting it_i to the end of this seq_id + this_gd_list.insert(std::prev(it_i), seq_id_begin_list.begin(), seq_id_begin_list.end()); + seq_id_begin_list.clear(); + } + current_seq_id = i["seq_id"]; + } else { + begin_count++; + } + + // If we are at the beginning of the seq_id add to the list we will concatenate to the end of the seq_id + if (begin_count < k_num_entries_to_look_ahead) { + seq_id_begin_list.push_back(*it_i); + } + } + // Insert at the end of the last seq_id + this_gd_list.insert(this_gd_list.end(), seq_id_begin_list.begin(), seq_id_begin_list.end()); + + /* + // Debug + cout << "====" << endl; + for (list::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){ + cDiffEntry& i = **it_i; + cout << i.as_string() << endl; + } + */ + + for (list::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){ + cDiffEntry& i = **it_i; - string i_seq_id = (*snp_muts[on_key][i])["seq_id"]; - int32_t i_position = from_string((*snp_muts[on_key][i])["position"]); + //cout << "+++" << i.as_string() << endl; + - vector codon_number_list_i = split((*snp_muts[on_key][i])["codon_number"], multiple_separator); - vector codon_position_list_i = split((*snp_muts[on_key][i])["codon_position"], multiple_separator); + string i_seq_id = i["seq_id"]; + int32_t i_position = from_string(i["position"]); + + vector codon_number_list_i = split(i["codon_number"], multiple_separator); + vector codon_position_list_i = split(i["codon_position"], multiple_separator); for (size_t ii = 0; ii < codon_number_list_i.size(); ii++) { @@ -3647,11 +3709,17 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo continue; int32_t i_codon_number = from_string(codon_number_list_i[ii]); - for (size_t j = i + 1; j < snp_muts[on_key].size(); j++) { - + // @JEB -- now look at most 10 snp muts ahead... greatly bounding the search + uint32_t look_ahead_index = 0; + for (list::iterator it_j= std::next(it_i); it_j != snp_muts[on_key].end(); it_j++) { + look_ahead_index++; + if (look_ahead_index > k_num_entries_to_look_ahead) + break; + cDiffEntry& j = **it_j; + //cout << "***" << j.as_string() << endl; - string j_seq_id = (*snp_muts[on_key][j])["seq_id"]; - int32_t j_position = from_string((*snp_muts[on_key][j])["position"]); + string j_seq_id = j["seq_id"]; + int32_t j_position = from_string(j["position"]); if (i_seq_id != j_seq_id) continue; @@ -3660,11 +3728,11 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo if (i_position == j_position) continue; - vector codon_number_list_j = split((*snp_muts[on_key][j])["codon_number"], multiple_separator); - vector codon_position_list_j = split((*snp_muts[on_key][j])["codon_position"], multiple_separator); + vector codon_number_list_j = split((**it_j)["codon_number"], multiple_separator); + vector codon_position_list_j = split((**it_j)["codon_position"], multiple_separator); for (size_t jj = 0; jj < codon_number_list_j.size(); jj++) { - + if ((codon_number_list_j[jj] == "") || (codon_position_list_j[jj] == "")) continue; int32_t j_codon_number = from_string(codon_number_list_j[jj]); @@ -3679,65 +3747,65 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo continue; // Only do this for SNPs that are not 100% frequency - if ( ((*snp_muts[on_key][i]).entry_exists(FREQUENCY) && ( from_string((*snp_muts[on_key][i])[FREQUENCY]) != 1.0 )) - || ((*snp_muts[on_key][j]).entry_exists(FREQUENCY) && ( from_string((*snp_muts[on_key][j])[FREQUENCY]) != 1.0 )) ) + if ( (i.entry_exists(FREQUENCY) && ( from_string(i[FREQUENCY]) != 1.0 )) + || (j.entry_exists(FREQUENCY) && ( from_string(j[FREQUENCY]) != 1.0 )) ) { // Create the field for marking multiple SNPs in the same codon if necessary - if (!(*snp_muts[on_key][i]).entry_exists("multiple_polymorphic_SNPs_in_same_codon")) { + if (!i.entry_exists("multiple_polymorphic_SNPs_in_same_codon")) { vector empty(codon_number_list_i.size(), "0"); - (*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator); + i["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator); } // Fill in this one as a multiple hit { - vector current = split((*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator); + vector current = split(i["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator); current[ii] = "1"; - (*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator); + i["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator); } // Create the field for marking multiple SNPs in the same codon if necessary - if (!(*snp_muts[on_key][j]).entry_exists("multiple_polymorphic_SNPs_in_same_codon")) { + if (!j.entry_exists("multiple_polymorphic_SNPs_in_same_codon")) { vector empty(codon_number_list_j.size(), "0"); - (*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator); + j["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator); } // Fill in this one as a multiple hit { - vector current = split((*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator); + vector current = split(j["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator); current[jj] = "1"; - (*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator); + j["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator); } continue; } - vector codon_new_seq_list_i = split((*snp_muts[on_key][i])["codon_new_seq"], multiple_separator); + vector codon_new_seq_list_i = split(i["codon_new_seq"], multiple_separator); string new_codon = codon_new_seq_list_i[ii]; - char new_char = (*snp_muts[on_key][j])["new_seq"][0]; - if ((*snp_muts[on_key][i])["gene_strand"] == gene_strand_reverse_char) new_char = reverse_complement(new_char); + char new_char = j["new_seq"][0]; + if (i["gene_strand"] == gene_strand_reverse_char) new_char = reverse_complement(new_char); new_codon[j_codon_position - 1] = new_char; - vector codon_new_seq_list_j = split((*snp_muts[on_key][j])["codon_new_seq"], multiple_separator); - vector aa_new_seq_list_i = split((*snp_muts[on_key][i])["aa_new_seq"], multiple_separator); - vector aa_new_seq_list_j = split((*snp_muts[on_key][j])["aa_new_seq"], multiple_separator); + vector codon_new_seq_list_j = split(j["codon_new_seq"], multiple_separator); + vector aa_new_seq_list_i = split(i["aa_new_seq"], multiple_separator); + vector aa_new_seq_list_j = split(j["aa_new_seq"], multiple_separator); // If translation table is NA, then we don't translate for that gene! - vector transl_table_list_i = split((*snp_muts[on_key][i])["transl_table"], multiple_separator); - vector transl_table_list_j = split((*snp_muts[on_key][i])["transl_table"], multiple_separator); + vector transl_table_list_i = split(i["transl_table"], multiple_separator); + vector transl_table_list_j = split(j["transl_table"], multiple_separator); if (transl_table_list_i[ii] != "NA") { codon_new_seq_list_i[ii] = new_codon; - aa_new_seq_list_i[ii] = translate_codon(new_codon, from_string(transl_table_list_i[ii]), from_string((*snp_muts[on_key][i])["aa_position"])); + aa_new_seq_list_i[ii] = translate_codon(new_codon, from_string(transl_table_list_i[ii]), from_string(i["aa_position"])); } if (transl_table_list_i[jj] != "NA") { codon_new_seq_list_j[jj] = new_codon; - aa_new_seq_list_j[jj] = translate_codon(new_codon, from_string(transl_table_list_i[jj]), from_string((*snp_muts[on_key][j])["aa_position"])); + aa_new_seq_list_j[jj] = translate_codon(new_codon, from_string(transl_table_list_i[jj]), from_string(j["aa_position"])); } - (*snp_muts[on_key][i])["codon_new_seq"] = join(codon_new_seq_list_i, multiple_separator); - (*snp_muts[on_key][j])["codon_new_seq"] = join(codon_new_seq_list_j, multiple_separator); - (*snp_muts[on_key][i])["aa_new_seq"] = join(aa_new_seq_list_i, multiple_separator); - (*snp_muts[on_key][j])["aa_new_seq"] = join(aa_new_seq_list_j, multiple_separator); + i["codon_new_seq"] = join(codon_new_seq_list_i, multiple_separator); + j["codon_new_seq"] = join(codon_new_seq_list_j, multiple_separator); + i["aa_new_seq"] = join(aa_new_seq_list_i, multiple_separator); + j["aa_new_seq"] = join(aa_new_seq_list_j, multiple_separator); // @JEB: Debatable what we should do with SNP-type here. For now it remains that of the single mutations... } diff --git a/src/c/breseq/settings.cpp b/src/c/breseq/settings.cpp index f1a70a99..90321e61 100644 --- a/src/c/breseq/settings.cpp +++ b/src/c/breseq/settings.cpp @@ -1018,6 +1018,7 @@ namespace breseq this->output_path = "output"; if (this->base_output_path.size() > 0) this->output_path = this->base_output_path + "/" + this->output_path; this->output_done_file_name = this->output_path + "/output.done"; + this->output_json_summary_file_name = this->output_path + "/summary.json"; this->output_vcf_file_name = this->output_path + "/output.vcf"; @@ -1030,8 +1031,11 @@ namespace breseq this->evidence_path = this->output_path + "/" + this->local_evidence_path; this->evidence_genome_diff_file_name = this->evidence_path + "/evidence.gd"; this->final_genome_diff_file_name = this->output_path + "/output.gd"; + this->preannotated_genome_diff_file_name = this->evidence_path + "/preannotated.gd"; this->annotated_genome_diff_file_name = this->evidence_path + "/annotated.gd"; - + this->output_mutation_prediction_done_file_name = this->evidence_path + "/mutation_prediction.done"; + this->output_mutation_annotation_done_file_name = this->evidence_path + "/mutation_annotation.done"; + this->local_coverage_plot_path = "evidence"; this->coverage_plot_path = this->output_path + "/" + this->local_coverage_plot_path; this->coverage_plot_r_script_file_name = this->program_data_path + "/plot_coverage.r"; diff --git a/tests/gdtools_convert_2/expected.json b/tests/gdtools_convert_2/expected.json index 47bfeed5..fc3e7b01 100644 --- a/tests/gdtools_convert_2/expected.json +++ b/tests/gdtools_convert_2/expected.json @@ -28,6 +28,7 @@ "position_start": "1000", "ref_seq": "CCA", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -62,6 +63,7 @@ "position_start": "2000", "ref_seq": "GCAGCAC", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -96,6 +98,7 @@ "position_start": "3000", "ref_seq": "CTG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -128,6 +131,7 @@ "position_start": "4000", "ref_seq": "GGGAATC", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "strand": "1", "type": "MOB" @@ -161,6 +165,7 @@ "position_start": "5000", "ref_seq": "AGGATC", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -196,6 +201,7 @@ "position_start": "6000", "ref_seq": "CAAT", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -232,6 +238,7 @@ "position_start": "7000", "ref_seq": "AGT", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -267,6 +274,7 @@ "position_start": "8000", "ref_seq": "ATG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -305,6 +313,7 @@ "position_start": "15000", "ref_seq": "GAC", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -343,6 +352,7 @@ "position_start": "16000", "ref_seq": "CGAGAAA", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -381,6 +391,7 @@ "position_start": "17000", "ref_seq": "ACG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -419,6 +430,7 @@ "position_start": "18000", "ref_seq": "CGTTTGG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -453,6 +465,7 @@ "position_start": "21000", "ref_seq": "C", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -487,6 +500,7 @@ "position_start": "22000", "ref_seq": "T", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -521,6 +535,7 @@ "position_start": "23000", "ref_seq": "C", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -555,6 +570,7 @@ "position_start": "24000", "ref_seq": "CT", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -590,6 +606,7 @@ "position_start": "25000", "ref_seq": "GAAAG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -625,6 +642,7 @@ "position_start": "26000", "ref_seq": "CG", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -660,6 +678,7 @@ "position_start": "27000", "ref_seq": "TCC", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", @@ -695,6 +714,7 @@ "position_start": "28000", "ref_seq": "C", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -733,6 +753,7 @@ "position_start": "35000", "ref_seq": "T", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -769,6 +790,7 @@ "position_start": "36000", "ref_seq": "CCGTATT", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "strand": "1", "type": "MOB" @@ -805,6 +827,7 @@ "position_start": "37000", "ref_seq": "T", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "-1", @@ -843,6 +866,7 @@ "position_start": "38000", "ref_seq": "AA", "repeat_name": "IS1", + "repeat_size": "768", "seq_id": "REL606-5", "snp_type": "", "strand": "1", diff --git a/tests/lambda_mixed_pop_custom_bowtie2/expected.gd b/tests/lambda_mixed_pop_custom_bowtie2/expected.gd index 9ff31944..51892343 100644 --- a/tests/lambda_mixed_pop_custom_bowtie2/expected.gd +++ b/tests/lambda_mixed_pop_custom_bowtie2/expected.gd @@ -1,5 +1,5 @@ #=GENOME_DIFF 1.0 -#=COMMAND ./src/c/breseq/breseq -j 4 -o tests/lambda_mixed_pop_custom_bowtie2 --bowtie2-scoring --ma 1 --mp 2 --np 1 --rdg 4,1 --rfg 4,1 --bowtie2-stage1 --local --score-min L,10,0.4 -k 20 --bowtie2-stage2 --bowtie2-junction --local --score-min L,10,0.30 -k 20 -r tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq +#=COMMAND ./src/c/breseq/breseq -j 4 -o tests/lambda_mixed_pop_custom_bowtie2 --bowtie2-scoring --ma 1 --mp 2 --np 1 --rdg 4,1 --rfg 4,1 --bowtie2-stage1 --local --score-min L,10,0.4 -k 20 --bowtie2-stage2 --bowtie2-junction --local --score-min L,10,0.30 -k 20 -r tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq #=REFSEQ tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk #=READSEQ tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq #=CONVERTED-BASES 6998495