From e0a5c1d1d4ec5e1e25a7fc93f4e250a5bab07a6b Mon Sep 17 00:00:00 2001
From: Jeffrey Barrick <jeffrey.e.barrick@gmail.com>
Date: Tue, 25 May 2021 17:16:43 -0500
Subject: [PATCH] Speed up and bug fix related to muliple base substitutions in
 a codon

---
 ChangeLog                                     |   4 +
 configure.ac                                  |   2 +-
 src/c/breseq/breseq_cmdline.cpp               | 267 ++++++++++--------
 src/c/breseq/genome_diff.cpp                  |   6 +-
 src/c/breseq/genome_diff_entry.cpp            |   8 +-
 src/c/breseq/libbreseq/common.h               |   4 +-
 src/c/breseq/libbreseq/genome_diff.h          |   2 +-
 src/c/breseq/libbreseq/genome_diff_entry.h    |   4 +-
 src/c/breseq/libbreseq/settings.h             |   6 +
 src/c/breseq/reference_sequence.cpp           | 146 +++++++---
 src/c/breseq/settings.cpp                     |   6 +-
 tests/gdtools_convert_2/expected.json         |  24 ++
 .../expected.gd                               |   2 +-
 13 files changed, 311 insertions(+), 170 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a81c2c3f..d2d84c5c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+Version 0.35.7
+  * Speedup and fix of rare bug related to annotating mutations to properly account for cases
+    in which multiple base substitutions occur in the same codon.
+
 Version 0.35.6
   * Should be compatible with GenBank reference files produced by Prokka and
     NCBI PGAP, and with GFF3 files produced by PGAP.
diff --git a/configure.ac b/configure.ac
index 8a39299d..304f11be 100644
--- a/configure.ac
+++ b/configure.ac
@@ -19,7 +19,7 @@
 #                                               -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
 AC_PREREQ([2.65])
-AC_INIT([breseq], [0.35.6], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq])
+AC_INIT([breseq], [0.35.7], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq])
 AC_CONFIG_AUX_DIR(aux_build)
 AC_CONFIG_MACRO_DIR([aux_build/m4])
 AC_CONFIG_HEADERS([aux_build/config.h])
diff --git a/src/c/breseq/breseq_cmdline.cpp b/src/c/breseq/breseq_cmdline.cpp
index c19ca3e6..3421c319 100644
--- a/src/c/breseq/breseq_cmdline.cpp
+++ b/src/c/breseq/breseq_cmdline.cpp
@@ -2270,138 +2270,173 @@ int breseq_default_action(int argc, char* argv[])
     // Output Genome Diff File
     ///
 
-    cerr << "Creating merged genome diff evidence file..." << endl;
+    cGenomeDiff mpgd;
+    if (settings.do_step(settings.output_mutation_prediction_done_file_name, "Output :: Mutation Prediction")) {
 
-    // merge all of the evidence GenomeDiff files into one...
-    create_path(settings.evidence_path);
-    
-    cGenomeDiff jc_gd;
-    if (!settings.skip_new_junction_prediction) {
-      jc_gd.read(settings.jc_genome_diff_file_name);
-    }
-         
-    // Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist
-    MutationPredictor mpj(ref_seq_info);
-    mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful
-    
-    assign_junction_read_counts(settings, summary, jc_gd);
+      cerr << "Creating merged genome diff evidence file..." << endl;
 
-    cGenomeDiff ra_mc_gd;
-    if (!settings.skip_read_alignment_and_missing_coverage_prediction) {
-      ra_mc_gd.read(settings.ra_mc_genome_diff_file_name);
-    }
-    test_RA_evidence(ra_mc_gd, ref_seq_info, settings);
-    
-    cGenomeDiff evidence_gd;
-    evidence_gd.merge_preserving_duplicates(jc_gd);
-    evidence_gd.merge_preserving_duplicates(ra_mc_gd);
-    
-    // there is a copy number genome diff for each sequence separately
-    if (settings.do_copy_number_variation) {
-      for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) {
-        cAnnotatedSequence& seq = *it;
-        string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id);
-        cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name);
-        evidence_gd.merge_preserving_duplicates(cn_gd);
+      // merge all of the evidence GenomeDiff files into one...
+      create_path(settings.evidence_path);
+      
+      cGenomeDiff jc_gd;
+      if (!settings.skip_new_junction_prediction) {
+        jc_gd.read(settings.jc_genome_diff_file_name);
       }
-    }
-    evidence_gd.write(settings.evidence_genome_diff_file_name);
+           
+      // Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist
+      MutationPredictor mpj(ref_seq_info);
+      mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful
+      
+      assign_junction_read_counts(settings, summary, jc_gd);
 
-    // predict mutations from evidence in the GenomeDiff
-    cerr << "Predicting mutations from evidence..." << endl;
+      cGenomeDiff ra_mc_gd;
+      if (!settings.skip_read_alignment_and_missing_coverage_prediction) {
+        ra_mc_gd.read(settings.ra_mc_genome_diff_file_name);
+      }
+      test_RA_evidence(ra_mc_gd, ref_seq_info, settings);
+      
+      cGenomeDiff evidence_gd;
+      evidence_gd.merge_preserving_duplicates(jc_gd);
+      evidence_gd.merge_preserving_duplicates(ra_mc_gd);
+      
+      // there is a copy number genome diff for each sequence separately
+      if (settings.do_copy_number_variation) {
+        for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) {
+          cAnnotatedSequence& seq = *it;
+          string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id);
+          cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name);
+          evidence_gd.merge_preserving_duplicates(cn_gd);
+        }
+      }
+      evidence_gd.write(settings.evidence_genome_diff_file_name);
 
-    MutationPredictor mp(ref_seq_info);
-    cGenomeDiff mpgd(settings.evidence_genome_diff_file_name);
-    mp.predict(settings, summary, mpgd);
-    mpgd.reassign_unique_ids();
+      // predict mutations from evidence in the GenomeDiff
+      cerr << "Predicting mutations from evidence..." << endl;
 
-    // Add metadata that will only be in the output.gd file
-    
-    //#=REFSEQ header lines.
-    mpgd.metadata.ref_seqs = settings.all_reference_file_names;
-    
-    //#=READSEQ header lines.
-    mpgd.metadata.read_seqs.resize(settings.read_files.size());
-    for (size_t i = 0; i < settings.read_files.size(); i++) {
-      mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name();
-    }
-    
-    // These fields will be overwritten by any header GenomeDiff provided
-    mpgd.metadata.title = settings.custom_run_name;
-    
-    // Load metadata from existing header GenomeDiff file
-    // This is purposefully done AFTER the previous metadata defs
-    // in order to overwrite those (but not others)
-    if (settings.header_genome_diff_file_name.size() > 0) {
-      cGenomeDiff header_gd(settings.header_genome_diff_file_name);
-      mpgd.metadata = header_gd.metadata;
-    }
-    
-    // Fields here and below will write over any values in the header GenomeDiff file
-    mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION);
-    mpgd.metadata.command = settings.full_command_line;
-    mpgd.metadata.created = Settings::time2string(time(NULL));
-    
-    // Add additional SUMMARY metadata
-    
-    mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads));
-    mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases));
-    mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads));
-    mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases));
-    mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast<int64_t>(summary.alignment_resolution.total_reads_mapped_to_references)));
-    mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references));
-
-    // Write the output.gd file
-    cerr << "  Writing final GD file..." << endl;
-    mpgd.write(settings.final_genome_diff_file_name);
-    
-    // Save a copy in the data folder as well
-    mpgd.write(settings.data_genome_diff_file_name);
+      MutationPredictor mp(ref_seq_info);
+      mpgd.read(settings.evidence_genome_diff_file_name);
+      mp.predict(settings, summary, mpgd);
+      mpgd.reassign_unique_ids();
 
-    // Faster, less compatible way...
-    // SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name);
-    
-    //Don't reload -- we lose invisible fields that we need
-    //cGenomeDiff gd(settings.final_genome_diff_file_name);
-    cGenomeDiff gd = mpgd;
-    
-    // Write VCF conversion (to both data and output)
-    cerr << "  Writing final VCF file..." << endl;
-    mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info);
-    mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info);
+      // Add metadata that will only be in the output.gd file
+      
+      //#=REFSEQ header lines.
+      mpgd.metadata.ref_seqs = settings.all_reference_file_names;
+      
+      //#=READSEQ header lines.
+      mpgd.metadata.read_seqs.resize(settings.read_files.size());
+      for (size_t i = 0; i < settings.read_files.size(); i++) {
+        mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name();
+      }
+      
+      // These fields will be overwritten by any header GenomeDiff provided
+      mpgd.metadata.title = settings.custom_run_name;
+      
+      // Load metadata from existing header GenomeDiff file
+      // This is purposefully done AFTER the previous metadata defs
+      // in order to overwrite those (but not others)
+      if (settings.header_genome_diff_file_name.size() > 0) {
+        cGenomeDiff header_gd(settings.header_genome_diff_file_name);
+        mpgd.metadata = header_gd.metadata;
+      }
+      
+      // Fields here and below will write over any values in the header GenomeDiff file
+      mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION);
+      mpgd.metadata.command = settings.full_command_line;
+      mpgd.metadata.created = Settings::time2string(time(NULL));
+      
+      // Add additional SUMMARY metadata
+      
+      mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads));
+      mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases));
+      mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads));
+      mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases));
+      mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast<int64_t>(summary.alignment_resolution.total_reads_mapped_to_references)));
+      mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references));
+
+      // Write the output.gd file
+      cerr << "  Writing final GD file..." << endl;
+      mpgd.write(settings.final_genome_diff_file_name);
+      
+      // Save a copy in the data folder as well
+      mpgd.write(settings.data_genome_diff_file_name);
+      
+      // Faster, less compatible way...
+      // SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name);
+      
+      // Save a copy with invisible fields in the annotation folder (this is what true does)
+      // We reload this to begin annotation if needed
+      mpgd.write(settings.preannotated_genome_diff_file_name, true);
+      settings.track_intermediate_file(settings.output_mutation_annotation_done_file_name, settings.preannotated_genome_diff_file_name);
+      
+      // Write VCF conversion (to both data and output)
+      cerr << "  Writing final VCF file..." << endl;
+      mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info);
+      mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info);
 
-    // Write a final JSON file with all summary information (to both data and output)
-    PublicSummary public_summary(summary, settings, ref_seq_info);
-    public_summary.store(settings.data_json_summary_file_name);
-    public_summary.store(settings.output_json_summary_file_name);
+      // Write a final JSON file with all summary information (to both data and output)
+      PublicSummary public_summary(summary, settings, ref_seq_info);
+      public_summary.store(settings.data_json_summary_file_name);
+      public_summary.store(settings.output_json_summary_file_name);
 
-    
-    //
-    // Mark marginal items as no_show to prevent further processing
-    //
-    output::mark_gd_entries_no_show(settings, gd);
+      
+      settings.done_step(settings.output_mutation_prediction_done_file_name);
 
-    //
-		// Annotate mutations
-		//
-		cerr << "Annotating mutations..." << endl;
-		ref_seq_info.annotate_mutations(gd);
+    }
     
-    // Annotated Genome Diff output #1 - public version
-    gd.write(settings.data_annotated_genome_diff_file_name);
+    cGenomeDiff gd;
+    if (settings.do_step(settings.output_mutation_annotation_done_file_name, "Output :: Mutation Annotation")) {
 
-    // Annotated Genome Diff output #2 - used internally and for consistency tests
-    //
-    // Here we empty certain metadata items: CREATED time and PROGRAM breseq version
-    // that should be allowed to change without marking the output as failed!
-    gd.metadata.created="";
-    gd.metadata.program="";
-    gd.write(settings.annotated_genome_diff_file_name);
-    
+      // Might be faster to copy or use existing gd if already loaded?
+      
+      // Reload if necessary... (ex: interrupted in this step)
+//      gd = mpgd;
+//    (gd.get_list().size() == 0) {
+        gd.read(settings.preannotated_genome_diff_file_name);
+        // Blank this out of the title unless it has been reassigned, for consistency
+        if (gd.metadata.title == cString(settings.preannotated_genome_diff_file_name).get_base_name_no_extension()) {
+          gd.metadata.title = "";
+        }
+ //     }
+            
+      //
+      // Mark marginal items as no_show to prevent further processing
+      //
+      output::mark_gd_entries_no_show(settings, gd);
+
+      //
+      // Annotate mutations
+      //
+      cerr << "Annotating mutations..." << endl;
+      ref_seq_info.annotate_mutations(gd);
+      
+      // Annotated Genome Diff output #1 - public version
+      gd.write(settings.data_annotated_genome_diff_file_name);
+
+      // Annotated Genome Diff output #2 - used internally and for consistency tests
+      //
+      // Here we empty certain metadata items: CREATED time and PROGRAM breseq version
+      // that should be allowed to change without marking the output as failed!
+      gd.metadata.created="";
+      gd.metadata.program="";
+      gd.write(settings.annotated_genome_diff_file_name);
+      
+      settings.done_step(settings.output_mutation_annotation_done_file_name);
+    }
+      
 		//
 		// Plot coverage of genome and large deletions
 		//
     
+    // Reload if necessary... (ex: interrupted in this step)
+    if (gd.get_list().size() == 0) {
+      gd.read(settings.annotated_genome_diff_file_name);
+      // Blank this out of the title unless it has been reassigned, for consistency
+      if (gd.metadata.title == cString(settings.annotated_genome_diff_file_name).get_base_name_no_extension()) {
+        gd.metadata.title = "";
+      }
+    }
+
     cerr << "Drawing coverage plots..." << endl;
     output::draw_coverage(settings, ref_seq_info, gd);
     
diff --git a/src/c/breseq/genome_diff.cpp b/src/c/breseq/genome_diff.cpp
index 4fd1882e..3b19a1a4 100644
--- a/src/c/breseq/genome_diff.cpp
+++ b/src/c/breseq/genome_diff.cpp
@@ -668,7 +668,7 @@ cFileParseErrors cGenomeDiff::valid_with_reference_sequences(cReferenceSequences
  1) If you want a diff entry to be commented out(prefix with '#') add the key
  "comment_out" to the diff entry.
  */
-void cGenomeDiff::write(const string& filename) {
+void cGenomeDiff::write(const string& filename, bool include_unprintable_fields) {
   string basename = cString(filename).get_base_name();
   string dir = cString(filename).remove_ending(basename);
   if (dir.size()) {
@@ -735,10 +735,10 @@ void cGenomeDiff::write(const string& filename) {
   
   for(diff_entry_list_t::iterator it=_entry_list.begin(); it!=_entry_list.end(); ++it) {
     if (!(*it)->entry_exists("comment_out")) {
-      fprintf(os, "%s\n", (**it).as_string().c_str());
+      fprintf(os, "%s\n", (**it).as_string(include_unprintable_fields).c_str());
     } else {
       (*it)->erase("comment_out");
-      fprintf(os, "#%s\n", (**it).as_string().c_str());
+      fprintf(os, "#%s\n", (**it).as_string(include_unprintable_fields).c_str());
     }
   }
   os.close();
diff --git a/src/c/breseq/genome_diff_entry.cpp b/src/c/breseq/genome_diff_entry.cpp
index b89c13fc..302db6f3 100644
--- a/src/c/breseq/genome_diff_entry.cpp
+++ b/src/c/breseq/genome_diff_entry.cpp
@@ -1185,7 +1185,7 @@ namespace breseq {
   
   /*! Marshal this diff entry into an ordered list of fields.
    */
-  void cDiffEntry::marshal(vector<string>& s) const {
+  void cDiffEntry::marshal(vector<string>& s, bool include_unprintable_fields) const {
     s.push_back(gd_entry_type_lookup_table[_type]);
     s.push_back(_id);
     
@@ -1221,7 +1221,7 @@ namespace breseq {
     for(diff_entry_map_t::iterator i=cp.begin(); i!=cp.end(); ++i) {
       
       assert(i->first.size());
-      if (is_unprintable_key(i->first)) continue;
+      if (!include_unprintable_fields && is_unprintable_key(i->first)) continue;
       if (i->second.empty()) continue;
       
       // Be sure the entry is non-empty! Would rather have this as a check.
@@ -1234,10 +1234,10 @@ namespace breseq {
   }
   
   // Created the line to be printed
-  string cDiffEntry::as_string(void) const
+  string cDiffEntry::as_string(bool include_unprintable_fields) const
   {
     vector<string> fields;
-    marshal(fields);
+    marshal(fields, include_unprintable_fields);
     return join(fields, "\t");
   }
   
diff --git a/src/c/breseq/libbreseq/common.h b/src/c/breseq/libbreseq/common.h
index 9a50d7e2..8b73abdc 100644
--- a/src/c/breseq/libbreseq/common.h
+++ b/src/c/breseq/libbreseq/common.h
@@ -1257,8 +1257,8 @@ inline cString cString::get_base_name() const
 
 //! Returns file name with no extension, removes any directory path beforehand.
 //  input.1.gd
-//  remove_all_extensions = FALSE removes everything past the first period  => input 
-//  remove_all_extensions = TRUE removes only the last period and beyond   => input.1
+//  remove_all_extensions = TRUE removes everything past the first period  => input
+//  remove_all_extensions = FALSE removes only the last period and beyond   => input.1
 inline cString cString::get_base_name_no_extension(bool remove_all_extensions, bool one_name_for_pair) const
 {
   cString this_return = this->get_base_name();
diff --git a/src/c/breseq/libbreseq/genome_diff.h b/src/c/breseq/libbreseq/genome_diff.h
index 3f22d926..a3425231 100644
--- a/src/c/breseq/libbreseq/genome_diff.h
+++ b/src/c/breseq/libbreseq/genome_diff.h
@@ -127,7 +127,7 @@ class cGenomeDiff
   cFileParseErrors valid_with_reference_sequences(cReferenceSequences& ref_seq, bool suppress_errors = false);
     
   //! Write the genome diff to a file.
-  void write(const string& filename);
+  void write(const string& filename, bool include_unprintable_fields=false);
   
   
   //!---- Adding and Removing Entries ---- !//
diff --git a/src/c/breseq/libbreseq/genome_diff_entry.h b/src/c/breseq/libbreseq/genome_diff_entry.h
index bf0d35f8..37c8e8ca 100644
--- a/src/c/breseq/libbreseq/genome_diff_entry.h
+++ b/src/c/breseq/libbreseq/genome_diff_entry.h
@@ -325,10 +325,10 @@ namespace breseq {
     //!---- Output ---- !//
     
     //! Marshal this diff entry into an ordered list of fields.
-    virtual void marshal(vector<string>& s) const;
+    virtual void marshal(vector<string>& s, bool include_unprintable_fields=false) const;
     
     //! Serialize this diff entry into a string for output.
-    virtual string as_string(void) const;
+    virtual string as_string(bool include_unprintable_fields=false) const;
     
     //! Output all keys and values
     string as_key_values() const {
diff --git a/src/c/breseq/libbreseq/settings.h b/src/c/breseq/libbreseq/settings.h
index a10e2745..adcb5ecc 100644
--- a/src/c/breseq/libbreseq/settings.h
+++ b/src/c/breseq/libbreseq/settings.h
@@ -541,6 +541,10 @@ namespace breseq
     //! Paths: Output
     string output_path;
 		string output_done_file_name;
+    // subsets of output
+    string output_mutation_prediction_done_file_name;
+    string output_mutation_annotation_done_file_name;
+    
     string output_json_summary_file_name;
     string output_vcf_file_name;
 
@@ -553,6 +557,8 @@ namespace breseq
 		string evidence_path;
     string evidence_genome_diff_file_name;
     string final_genome_diff_file_name;
+    string preannotated_genome_diff_file_name;
+    
     string annotated_genome_diff_file_name;
 
 		string local_coverage_plot_path;
diff --git a/src/c/breseq/reference_sequence.cpp b/src/c/breseq/reference_sequence.cpp
index c010d810..455ba0ad 100644
--- a/src/c/breseq/reference_sequence.cpp
+++ b/src/c/breseq/reference_sequence.cpp
@@ -3520,7 +3520,7 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
   }
   if (compare_key_list.size() == 0) compare_key_list.push_back(default_key);
   
-  map<string, vector<cDiffEntry*> > snp_muts;
+  map<string, list<cDiffEntry*> > snp_muts;
 
   for (diff_entry_list_t::iterator it=muts.begin(); it!=muts.end(); it++)
   {
@@ -3608,6 +3608,13 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
         mut["ref_seq"] = this->get_sequence_1(mut[SEQ_ID], ref_start_1, ref_end_1);
       }
       
+      // @JEB: We look for repeat sequence in original reference sequence.
+      //       This saves us from possibly looking at a shifted location...
+      if (mut._type == MOB) {
+        string rep_string = this->repeat_family_sequence(mut["repeat_name"], from_string<int16_t>(mut["strand"]));
+        mut["repeat_size"] = to_string(rep_string.length()); // saving this for shifting
+      }
+      
       categorize_1_mutation(mut, large_size_cutoff);
     }
     
@@ -3628,18 +3635,73 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
     }
   }
   
-  // Hugely inefficient step --!!
+  // @JEB 2020-05-24 Sped up this step by enforcing a lookahead distance and inserting wrapped entries
+  //
   // Scan SNPs to see if they affect the same codon, merge changes, and 
-  // update amino acid changes assumes the list of snps is sorted.
+  // update amino acid changes. Assumes the lists of snps are sorted.
+  
+  const uint32_t k_num_entries_to_look_ahead(6);
   for(vector<string>::iterator it_key = compare_key_list.begin(); it_key != compare_key_list.end(); it_key++) {
     string on_key = *it_key;
-    for (size_t i = 0; i < snp_muts[on_key].size(); i++){
+    
+    /*
+    // Debug
+    cout << "====" << endl;
+    for (list<cDiffEntry*>::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){
+      cDiffEntry& i = **it_i;
+      cout << i.as_string() << endl;
+    }
+     */
+    
+    // We need to fix copy entries from the beginning of each seq_id to the end of each seq_id in order to wrap around the origin
+    // There's no need to check if it is circular. The distance calculation later takes care of this
+    list<cDiffEntry*> seq_id_begin_list;
+    list<cDiffEntry*>& this_gd_list(snp_muts[on_key]);
+    
+    string current_seq_id;
+    uint32_t begin_count(0);
+    for (list<cDiffEntry*>::iterator it_i = this_gd_list.begin(); it_i != this_gd_list.end(); it_i++){
+      cDiffEntry& i = **it_i;
+      if (i["seq_id"] != current_seq_id){
+        begin_count = 0;
+        if (current_seq_id.size() > 0) {
+          // Insert after the one before... resetting it_i to the end of this seq_id
+          this_gd_list.insert(std::prev(it_i), seq_id_begin_list.begin(), seq_id_begin_list.end());
+          seq_id_begin_list.clear();
+        }
+        current_seq_id = i["seq_id"];
+      } else {
+        begin_count++;
+      }
+      
+      // If we are at the beginning of the seq_id add to the list we will concatenate to the end of the seq_id
+      if (begin_count < k_num_entries_to_look_ahead) {
+        seq_id_begin_list.push_back(*it_i);
+      }
+    }
+    // Insert at the end of the last seq_id
+    this_gd_list.insert(this_gd_list.end(), seq_id_begin_list.begin(), seq_id_begin_list.end());
+    
+    /*
+    // Debug
+    cout << "====" << endl;
+    for (list<cDiffEntry*>::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){
+      cDiffEntry& i = **it_i;
+      cout << i.as_string() << endl;
+    }
+     */
+    
+    for (list<cDiffEntry*>::iterator it_i = snp_muts[on_key].begin(); it_i != snp_muts[on_key].end(); it_i++){
+      cDiffEntry& i = **it_i;
       
-      string i_seq_id = (*snp_muts[on_key][i])["seq_id"];
-      int32_t i_position = from_string<int32_t>((*snp_muts[on_key][i])["position"]);
+      //cout << "+++" << i.as_string() << endl;
+
       
-      vector<string> codon_number_list_i = split((*snp_muts[on_key][i])["codon_number"], multiple_separator);
-      vector<string> codon_position_list_i = split((*snp_muts[on_key][i])["codon_position"], multiple_separator);
+      string i_seq_id = i["seq_id"];
+      int32_t i_position = from_string<int32_t>(i["position"]);
+      
+      vector<string> codon_number_list_i = split(i["codon_number"], multiple_separator);
+      vector<string> codon_position_list_i = split(i["codon_position"], multiple_separator);
 
       for (size_t ii = 0; ii < codon_number_list_i.size(); ii++) {
       
@@ -3647,11 +3709,17 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
           continue;
         int32_t i_codon_number = from_string<int32_t>(codon_number_list_i[ii]);
         
-        for (size_t j = i + 1; j < snp_muts[on_key].size(); j++) {
-          
+        // @JEB -- now look at most 10 snp muts ahead... greatly bounding the search
+        uint32_t look_ahead_index = 0;
+        for (list<cDiffEntry*>::iterator it_j= std::next(it_i); it_j != snp_muts[on_key].end(); it_j++) {
+          look_ahead_index++;
+          if (look_ahead_index > k_num_entries_to_look_ahead)
+            break;
+          cDiffEntry& j = **it_j;
+          //cout << "***" << j.as_string() << endl;
           
-          string j_seq_id = (*snp_muts[on_key][j])["seq_id"];
-          int32_t j_position = from_string<int32_t>((*snp_muts[on_key][j])["position"]);
+          string j_seq_id = j["seq_id"];
+          int32_t j_position = from_string<int32_t>(j["position"]);
           
           if (i_seq_id != j_seq_id)
             continue;
@@ -3660,11 +3728,11 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
           if (i_position == j_position)
             continue;
           
-          vector<string> codon_number_list_j = split((*snp_muts[on_key][j])["codon_number"], multiple_separator);
-          vector<string> codon_position_list_j = split((*snp_muts[on_key][j])["codon_position"], multiple_separator);
+          vector<string> codon_number_list_j = split((**it_j)["codon_number"], multiple_separator);
+          vector<string> codon_position_list_j = split((**it_j)["codon_position"], multiple_separator);
           
           for (size_t jj = 0; jj < codon_number_list_j.size(); jj++) {
-
+            
             if ((codon_number_list_j[jj] == "") || (codon_position_list_j[jj] == ""))
               continue;
             int32_t j_codon_number = from_string<int32_t>(codon_number_list_j[jj]);
@@ -3679,65 +3747,65 @@ void cReferenceSequences::annotate_mutations(cGenomeDiff& gd, bool only_muts, bo
               continue;
             
             // Only do this for SNPs that are not 100% frequency
-            if (   ((*snp_muts[on_key][i]).entry_exists(FREQUENCY) && ( from_string<double>((*snp_muts[on_key][i])[FREQUENCY]) != 1.0 ))
-                || ((*snp_muts[on_key][j]).entry_exists(FREQUENCY) && ( from_string<double>((*snp_muts[on_key][j])[FREQUENCY]) != 1.0 )) )
+            if (   (i.entry_exists(FREQUENCY) && ( from_string<double>(i[FREQUENCY]) != 1.0 ))
+                || (j.entry_exists(FREQUENCY) && ( from_string<double>(j[FREQUENCY]) != 1.0 )) )
             {
               
               // Create the field for marking multiple SNPs in the same codon if necessary
-              if (!(*snp_muts[on_key][i]).entry_exists("multiple_polymorphic_SNPs_in_same_codon")) {
+              if (!i.entry_exists("multiple_polymorphic_SNPs_in_same_codon")) {
                 vector<string> empty(codon_number_list_i.size(), "0");
-                (*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator);
+                i["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator);
               }
                 
               // Fill in this one as a multiple hit
               {
-                vector<string> current = split((*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator);
+                vector<string> current = split(i["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator);
                 current[ii] = "1";
-                (*snp_muts[on_key][i])["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator);
+                i["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator);
               }
               
               // Create the field for marking multiple SNPs in the same codon if necessary
-              if (!(*snp_muts[on_key][j]).entry_exists("multiple_polymorphic_SNPs_in_same_codon")) {
+              if (!j.entry_exists("multiple_polymorphic_SNPs_in_same_codon")) {
                 vector<string> empty(codon_number_list_j.size(), "0");
-                (*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator);
+                j["multiple_polymorphic_SNPs_in_same_codon"] = join(empty, multiple_separator);
               }
               
               // Fill in this one as a multiple hit
               {
-                vector<string> current = split((*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator);
+                vector<string> current = split(j["multiple_polymorphic_SNPs_in_same_codon"], multiple_separator);
                 current[jj] = "1";
-                (*snp_muts[on_key][j])["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator);
+                j["multiple_polymorphic_SNPs_in_same_codon"] = join(current, multiple_separator);
               }
               
               continue;
             }
             
-            vector<string> codon_new_seq_list_i = split((*snp_muts[on_key][i])["codon_new_seq"], multiple_separator);
+            vector<string> codon_new_seq_list_i = split(i["codon_new_seq"], multiple_separator);
             string new_codon = codon_new_seq_list_i[ii];
-            char new_char = (*snp_muts[on_key][j])["new_seq"][0];
-            if ((*snp_muts[on_key][i])["gene_strand"] == gene_strand_reverse_char) new_char = reverse_complement(new_char);
+            char new_char = j["new_seq"][0];
+            if (i["gene_strand"] == gene_strand_reverse_char) new_char = reverse_complement(new_char);
             new_codon[j_codon_position - 1] = new_char;
             
-            vector<string> codon_new_seq_list_j = split((*snp_muts[on_key][j])["codon_new_seq"], multiple_separator);
-            vector<string> aa_new_seq_list_i = split((*snp_muts[on_key][i])["aa_new_seq"], multiple_separator);
-            vector<string> aa_new_seq_list_j = split((*snp_muts[on_key][j])["aa_new_seq"], multiple_separator);
+            vector<string> codon_new_seq_list_j = split(j["codon_new_seq"], multiple_separator);
+            vector<string> aa_new_seq_list_i = split(i["aa_new_seq"], multiple_separator);
+            vector<string> aa_new_seq_list_j = split(j["aa_new_seq"], multiple_separator);
             
             // If translation table is NA, then we don't translate for that gene!
-            vector<string> transl_table_list_i = split((*snp_muts[on_key][i])["transl_table"], multiple_separator);
-            vector<string> transl_table_list_j = split((*snp_muts[on_key][i])["transl_table"], multiple_separator);
+            vector<string> transl_table_list_i = split(i["transl_table"], multiple_separator);
+            vector<string> transl_table_list_j = split(j["transl_table"], multiple_separator);
 
             if (transl_table_list_i[ii] != "NA") {
               codon_new_seq_list_i[ii] = new_codon;
-              aa_new_seq_list_i[ii] =  translate_codon(new_codon, from_string(transl_table_list_i[ii]), from_string((*snp_muts[on_key][i])["aa_position"]));
+              aa_new_seq_list_i[ii] =  translate_codon(new_codon, from_string(transl_table_list_i[ii]), from_string(i["aa_position"]));
             }
             if (transl_table_list_i[jj] != "NA") {
               codon_new_seq_list_j[jj] = new_codon;
-              aa_new_seq_list_j[jj] =  translate_codon(new_codon, from_string(transl_table_list_i[jj]), from_string((*snp_muts[on_key][j])["aa_position"]));
+              aa_new_seq_list_j[jj] =  translate_codon(new_codon, from_string(transl_table_list_i[jj]), from_string(j["aa_position"]));
             }
-            (*snp_muts[on_key][i])["codon_new_seq"] = join(codon_new_seq_list_i, multiple_separator);
-            (*snp_muts[on_key][j])["codon_new_seq"] = join(codon_new_seq_list_j, multiple_separator);
-            (*snp_muts[on_key][i])["aa_new_seq"] = join(aa_new_seq_list_i, multiple_separator);
-            (*snp_muts[on_key][j])["aa_new_seq"] = join(aa_new_seq_list_j, multiple_separator);
+            i["codon_new_seq"] = join(codon_new_seq_list_i, multiple_separator);
+            j["codon_new_seq"] = join(codon_new_seq_list_j, multiple_separator);
+            i["aa_new_seq"] = join(aa_new_seq_list_i, multiple_separator);
+            j["aa_new_seq"] = join(aa_new_seq_list_j, multiple_separator);
             
             // @JEB: Debatable what we should do with SNP-type here. For now it remains that of the single mutations...
           }
diff --git a/src/c/breseq/settings.cpp b/src/c/breseq/settings.cpp
index f1a70a99..90321e61 100644
--- a/src/c/breseq/settings.cpp
+++ b/src/c/breseq/settings.cpp
@@ -1018,6 +1018,7 @@ namespace breseq
 		this->output_path = "output";
 		if (this->base_output_path.size() > 0) this->output_path = this->base_output_path + "/" + this->output_path;
 		this->output_done_file_name = this->output_path + "/output.done";
+    
     this->output_json_summary_file_name = this->output_path + "/summary.json";
     this->output_vcf_file_name = this->output_path + "/output.vcf";
 
@@ -1030,8 +1031,11 @@ namespace breseq
 		this->evidence_path = this->output_path + "/" + this->local_evidence_path;
 		this->evidence_genome_diff_file_name = this->evidence_path + "/evidence.gd";
 		this->final_genome_diff_file_name = this->output_path + "/output.gd";
+    this->preannotated_genome_diff_file_name = this->evidence_path + "/preannotated.gd";
     this->annotated_genome_diff_file_name = this->evidence_path + "/annotated.gd";
-    
+    this->output_mutation_prediction_done_file_name = this->evidence_path + "/mutation_prediction.done";
+    this->output_mutation_annotation_done_file_name = this->evidence_path + "/mutation_annotation.done";
+
 		this->local_coverage_plot_path = "evidence";
 		this->coverage_plot_path = this->output_path + "/" + this->local_coverage_plot_path;
     this->coverage_plot_r_script_file_name = this->program_data_path + "/plot_coverage.r";    
diff --git a/tests/gdtools_convert_2/expected.json b/tests/gdtools_convert_2/expected.json
index 47bfeed5..fc3e7b01 100644
--- a/tests/gdtools_convert_2/expected.json
+++ b/tests/gdtools_convert_2/expected.json
@@ -28,6 +28,7 @@
       "position_start": "1000",
       "ref_seq": "CCA",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -62,6 +63,7 @@
       "position_start": "2000",
       "ref_seq": "GCAGCAC",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -96,6 +98,7 @@
       "position_start": "3000",
       "ref_seq": "CTG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -128,6 +131,7 @@
       "position_start": "4000",
       "ref_seq": "GGGAATC",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "strand": "1",
       "type": "MOB"
@@ -161,6 +165,7 @@
       "position_start": "5000",
       "ref_seq": "AGGATC",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -196,6 +201,7 @@
       "position_start": "6000",
       "ref_seq": "CAAT",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -232,6 +238,7 @@
       "position_start": "7000",
       "ref_seq": "AGT",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -267,6 +274,7 @@
       "position_start": "8000",
       "ref_seq": "ATG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -305,6 +313,7 @@
       "position_start": "15000",
       "ref_seq": "GAC",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -343,6 +352,7 @@
       "position_start": "16000",
       "ref_seq": "CGAGAAA",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -381,6 +391,7 @@
       "position_start": "17000",
       "ref_seq": "ACG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -419,6 +430,7 @@
       "position_start": "18000",
       "ref_seq": "CGTTTGG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -453,6 +465,7 @@
       "position_start": "21000",
       "ref_seq": "C",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -487,6 +500,7 @@
       "position_start": "22000",
       "ref_seq": "T",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -521,6 +535,7 @@
       "position_start": "23000",
       "ref_seq": "C",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -555,6 +570,7 @@
       "position_start": "24000",
       "ref_seq": "CT",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -590,6 +606,7 @@
       "position_start": "25000",
       "ref_seq": "GAAAG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -625,6 +642,7 @@
       "position_start": "26000",
       "ref_seq": "CG",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -660,6 +678,7 @@
       "position_start": "27000",
       "ref_seq": "TCC",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
@@ -695,6 +714,7 @@
       "position_start": "28000",
       "ref_seq": "C",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -733,6 +753,7 @@
       "position_start": "35000",
       "ref_seq": "T",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -769,6 +790,7 @@
       "position_start": "36000",
       "ref_seq": "CCGTATT",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "strand": "1",
       "type": "MOB"
@@ -805,6 +827,7 @@
       "position_start": "37000",
       "ref_seq": "T",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "-1",
@@ -843,6 +866,7 @@
       "position_start": "38000",
       "ref_seq": "AA",
       "repeat_name": "IS1",
+      "repeat_size": "768",
       "seq_id": "REL606-5",
       "snp_type": "",
       "strand": "1",
diff --git a/tests/lambda_mixed_pop_custom_bowtie2/expected.gd b/tests/lambda_mixed_pop_custom_bowtie2/expected.gd
index 9ff31944..51892343 100644
--- a/tests/lambda_mixed_pop_custom_bowtie2/expected.gd
+++ b/tests/lambda_mixed_pop_custom_bowtie2/expected.gd
@@ -1,5 +1,5 @@
 #=GENOME_DIFF	1.0
-#=COMMAND	./src/c/breseq/breseq -j 4 -o tests/lambda_mixed_pop_custom_bowtie2 --bowtie2-scoring --ma 1 --mp 2 --np 1 --rdg 4,1 --rfg 4,1 --bowtie2-stage1 --local --score-min L,10,0.4 -k 20 --bowtie2-stage2  --bowtie2-junction --local --score-min L,10,0.30  -k 20 -r tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq
+#=COMMAND	./src/c/breseq/breseq -j 4 -o tests/lambda_mixed_pop_custom_bowtie2 --bowtie2-scoring --ma 1 --mp 2 --np 1 --rdg 4,1 --rfg 4,1 --bowtie2-stage1 --local --score-min L,10,0.4 -k 20 --bowtie2-stage2 --bowtie2-junction --local --score-min L,10,0.30 -k 20 -r tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq
 #=REFSEQ	tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda.gbk
 #=READSEQ	tests/lambda_mixed_pop_custom_bowtie2/../data/lambda/lambda_mixed_population.fastq
 #=CONVERTED-BASES	6998495