Skip to content

Commit

Permalink
Speed up and bug fix related to muliple base substitutions in a codon
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffreybarrick committed May 25, 2021
1 parent c98392a commit e0a5c1d
Show file tree
Hide file tree
Showing 13 changed files with 311 additions and 170 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Version 0.35.7
* Speedup and fix of rare bug related to annotating mutations to properly account for cases
in which multiple base substitutions occur in the same codon.

Version 0.35.6
* Should be compatible with GenBank reference files produced by Prokka and
NCBI PGAP, and with GFF3 files produced by PGAP.
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.65])
AC_INIT([breseq], [0.35.6], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq])
AC_INIT([breseq], [0.35.7], [jeffrey.e.barrick@gmail.com], [breseq], [http://barricklab.org/breseq])
AC_CONFIG_AUX_DIR(aux_build)
AC_CONFIG_MACRO_DIR([aux_build/m4])
AC_CONFIG_HEADERS([aux_build/config.h])
Expand Down
267 changes: 151 additions & 116 deletions src/c/breseq/breseq_cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2270,138 +2270,173 @@ int breseq_default_action(int argc, char* argv[])
// Output Genome Diff File
///

cerr << "Creating merged genome diff evidence file..." << endl;
cGenomeDiff mpgd;
if (settings.do_step(settings.output_mutation_prediction_done_file_name, "Output :: Mutation Prediction")) {

// merge all of the evidence GenomeDiff files into one...
create_path(settings.evidence_path);

cGenomeDiff jc_gd;
if (!settings.skip_new_junction_prediction) {
jc_gd.read(settings.jc_genome_diff_file_name);
}

// Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist
MutationPredictor mpj(ref_seq_info);
mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful

assign_junction_read_counts(settings, summary, jc_gd);
cerr << "Creating merged genome diff evidence file..." << endl;

cGenomeDiff ra_mc_gd;
if (!settings.skip_read_alignment_and_missing_coverage_prediction) {
ra_mc_gd.read(settings.ra_mc_genome_diff_file_name);
}
test_RA_evidence(ra_mc_gd, ref_seq_info, settings);

cGenomeDiff evidence_gd;
evidence_gd.merge_preserving_duplicates(jc_gd);
evidence_gd.merge_preserving_duplicates(ra_mc_gd);

// there is a copy number genome diff for each sequence separately
if (settings.do_copy_number_variation) {
for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) {
cAnnotatedSequence& seq = *it;
string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id);
cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name);
evidence_gd.merge_preserving_duplicates(cn_gd);
// merge all of the evidence GenomeDiff files into one...
create_path(settings.evidence_path);

cGenomeDiff jc_gd;
if (!settings.skip_new_junction_prediction) {
jc_gd.read(settings.jc_genome_diff_file_name);
}
}
evidence_gd.write(settings.evidence_genome_diff_file_name);

// Add read count information to the JC entries -- call fails if no predictions, because BAM does not exist
MutationPredictor mpj(ref_seq_info);
mpj.prepare_junctions(settings, summary, jc_gd); // this step has to be done twice currently, which is a bit wasteful

assign_junction_read_counts(settings, summary, jc_gd);

// predict mutations from evidence in the GenomeDiff
cerr << "Predicting mutations from evidence..." << endl;
cGenomeDiff ra_mc_gd;
if (!settings.skip_read_alignment_and_missing_coverage_prediction) {
ra_mc_gd.read(settings.ra_mc_genome_diff_file_name);
}
test_RA_evidence(ra_mc_gd, ref_seq_info, settings);

cGenomeDiff evidence_gd;
evidence_gd.merge_preserving_duplicates(jc_gd);
evidence_gd.merge_preserving_duplicates(ra_mc_gd);

// there is a copy number genome diff for each sequence separately
if (settings.do_copy_number_variation) {
for (cReferenceSequences::iterator it = ref_seq_info.begin(); it != ref_seq_info.end(); ++it) {
cAnnotatedSequence& seq = *it;
string this_copy_number_variation_cn_genome_diff_file_name = settings.file_name(settings.copy_number_variation_cn_genome_diff_file_name, "@", seq.m_seq_id);
cGenomeDiff cn_gd(this_copy_number_variation_cn_genome_diff_file_name);
evidence_gd.merge_preserving_duplicates(cn_gd);
}
}
evidence_gd.write(settings.evidence_genome_diff_file_name);

MutationPredictor mp(ref_seq_info);
cGenomeDiff mpgd(settings.evidence_genome_diff_file_name);
mp.predict(settings, summary, mpgd);
mpgd.reassign_unique_ids();
// predict mutations from evidence in the GenomeDiff
cerr << "Predicting mutations from evidence..." << endl;

// Add metadata that will only be in the output.gd file

//#=REFSEQ header lines.
mpgd.metadata.ref_seqs = settings.all_reference_file_names;

//#=READSEQ header lines.
mpgd.metadata.read_seqs.resize(settings.read_files.size());
for (size_t i = 0; i < settings.read_files.size(); i++) {
mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name();
}

// These fields will be overwritten by any header GenomeDiff provided
mpgd.metadata.title = settings.custom_run_name;

// Load metadata from existing header GenomeDiff file
// This is purposefully done AFTER the previous metadata defs
// in order to overwrite those (but not others)
if (settings.header_genome_diff_file_name.size() > 0) {
cGenomeDiff header_gd(settings.header_genome_diff_file_name);
mpgd.metadata = header_gd.metadata;
}

// Fields here and below will write over any values in the header GenomeDiff file
mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION);
mpgd.metadata.command = settings.full_command_line;
mpgd.metadata.created = Settings::time2string(time(NULL));

// Add additional SUMMARY metadata

mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads));
mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases));
mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads));
mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases));
mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast<int64_t>(summary.alignment_resolution.total_reads_mapped_to_references)));
mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references));

// Write the output.gd file
cerr << " Writing final GD file..." << endl;
mpgd.write(settings.final_genome_diff_file_name);

// Save a copy in the data folder as well
mpgd.write(settings.data_genome_diff_file_name);
MutationPredictor mp(ref_seq_info);
mpgd.read(settings.evidence_genome_diff_file_name);
mp.predict(settings, summary, mpgd);
mpgd.reassign_unique_ids();

// Faster, less compatible way...
// SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name);

//Don't reload -- we lose invisible fields that we need
//cGenomeDiff gd(settings.final_genome_diff_file_name);
cGenomeDiff gd = mpgd;

// Write VCF conversion (to both data and output)
cerr << " Writing final VCF file..." << endl;
mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info);
mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info);
// Add metadata that will only be in the output.gd file

//#=REFSEQ header lines.
mpgd.metadata.ref_seqs = settings.all_reference_file_names;

//#=READSEQ header lines.
mpgd.metadata.read_seqs.resize(settings.read_files.size());
for (size_t i = 0; i < settings.read_files.size(); i++) {
mpgd.metadata.read_seqs[i] = settings.read_files[i].file_name();
}

// These fields will be overwritten by any header GenomeDiff provided
mpgd.metadata.title = settings.custom_run_name;

// Load metadata from existing header GenomeDiff file
// This is purposefully done AFTER the previous metadata defs
// in order to overwrite those (but not others)
if (settings.header_genome_diff_file_name.size() > 0) {
cGenomeDiff header_gd(settings.header_genome_diff_file_name);
mpgd.metadata = header_gd.metadata;
}

// Fields here and below will write over any values in the header GenomeDiff file
mpgd.metadata.program = string(PACKAGE_STRING) + " " + string(HG_REVISION);
mpgd.metadata.command = settings.full_command_line;
mpgd.metadata.created = Settings::time2string(time(NULL));

// Add additional SUMMARY metadata

mpgd.add_breseq_data("INPUT-READS", to_string(summary.sequence_conversion.num_original_reads));
mpgd.add_breseq_data("INPUT-BASES", to_string(summary.sequence_conversion.num_original_bases));
mpgd.add_breseq_data("CONVERTED-READS", to_string(summary.alignment_resolution.total_reads));
mpgd.add_breseq_data("CONVERTED-BASES", to_string(summary.alignment_resolution.total_bases));
mpgd.add_breseq_data("MAPPED-READS", to_string(static_cast<int64_t>(summary.alignment_resolution.total_reads_mapped_to_references)));
mpgd.add_breseq_data("MAPPED-BASES", to_string(summary.alignment_resolution.total_bases_mapped_to_references));

// Write the output.gd file
cerr << " Writing final GD file..." << endl;
mpgd.write(settings.final_genome_diff_file_name);

// Save a copy in the data folder as well
mpgd.write(settings.data_genome_diff_file_name);

// Faster, less compatible way...
// SYSTEM("cp " + settings.final_genome_diff_file_name + " " + settings.data_genome_diff_file_name);

// Save a copy with invisible fields in the annotation folder (this is what true does)
// We reload this to begin annotation if needed
mpgd.write(settings.preannotated_genome_diff_file_name, true);
settings.track_intermediate_file(settings.output_mutation_annotation_done_file_name, settings.preannotated_genome_diff_file_name);

// Write VCF conversion (to both data and output)
cerr << " Writing final VCF file..." << endl;
mpgd.write_vcf(settings.data_vcf_file_name, ref_seq_info);
mpgd.write_vcf(settings.output_vcf_file_name, ref_seq_info);

// Write a final JSON file with all summary information (to both data and output)
PublicSummary public_summary(summary, settings, ref_seq_info);
public_summary.store(settings.data_json_summary_file_name);
public_summary.store(settings.output_json_summary_file_name);
// Write a final JSON file with all summary information (to both data and output)
PublicSummary public_summary(summary, settings, ref_seq_info);
public_summary.store(settings.data_json_summary_file_name);
public_summary.store(settings.output_json_summary_file_name);


//
// Mark marginal items as no_show to prevent further processing
//
output::mark_gd_entries_no_show(settings, gd);

settings.done_step(settings.output_mutation_prediction_done_file_name);

//
// Annotate mutations
//
cerr << "Annotating mutations..." << endl;
ref_seq_info.annotate_mutations(gd);
}

// Annotated Genome Diff output #1 - public version
gd.write(settings.data_annotated_genome_diff_file_name);
cGenomeDiff gd;
if (settings.do_step(settings.output_mutation_annotation_done_file_name, "Output :: Mutation Annotation")) {

// Annotated Genome Diff output #2 - used internally and for consistency tests
//
// Here we empty certain metadata items: CREATED time and PROGRAM breseq version
// that should be allowed to change without marking the output as failed!
gd.metadata.created="";
gd.metadata.program="";
gd.write(settings.annotated_genome_diff_file_name);

// Might be faster to copy or use existing gd if already loaded?

// Reload if necessary... (ex: interrupted in this step)
// gd = mpgd;
// (gd.get_list().size() == 0) {
gd.read(settings.preannotated_genome_diff_file_name);
// Blank this out of the title unless it has been reassigned, for consistency
if (gd.metadata.title == cString(settings.preannotated_genome_diff_file_name).get_base_name_no_extension()) {
gd.metadata.title = "";
}
// }

//
// Mark marginal items as no_show to prevent further processing
//
output::mark_gd_entries_no_show(settings, gd);

//
// Annotate mutations
//
cerr << "Annotating mutations..." << endl;
ref_seq_info.annotate_mutations(gd);

// Annotated Genome Diff output #1 - public version
gd.write(settings.data_annotated_genome_diff_file_name);

// Annotated Genome Diff output #2 - used internally and for consistency tests
//
// Here we empty certain metadata items: CREATED time and PROGRAM breseq version
// that should be allowed to change without marking the output as failed!
gd.metadata.created="";
gd.metadata.program="";
gd.write(settings.annotated_genome_diff_file_name);

settings.done_step(settings.output_mutation_annotation_done_file_name);
}

//
// Plot coverage of genome and large deletions
//

// Reload if necessary... (ex: interrupted in this step)
if (gd.get_list().size() == 0) {
gd.read(settings.annotated_genome_diff_file_name);
// Blank this out of the title unless it has been reassigned, for consistency
if (gd.metadata.title == cString(settings.annotated_genome_diff_file_name).get_base_name_no_extension()) {
gd.metadata.title = "";
}
}

cerr << "Drawing coverage plots..." << endl;
output::draw_coverage(settings, ref_seq_info, gd);

Expand Down
6 changes: 3 additions & 3 deletions src/c/breseq/genome_diff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ cFileParseErrors cGenomeDiff::valid_with_reference_sequences(cReferenceSequences
1) If you want a diff entry to be commented out(prefix with '#') add the key
"comment_out" to the diff entry.
*/
void cGenomeDiff::write(const string& filename) {
void cGenomeDiff::write(const string& filename, bool include_unprintable_fields) {
string basename = cString(filename).get_base_name();
string dir = cString(filename).remove_ending(basename);
if (dir.size()) {
Expand Down Expand Up @@ -735,10 +735,10 @@ void cGenomeDiff::write(const string& filename) {

for(diff_entry_list_t::iterator it=_entry_list.begin(); it!=_entry_list.end(); ++it) {
if (!(*it)->entry_exists("comment_out")) {
fprintf(os, "%s\n", (**it).as_string().c_str());
fprintf(os, "%s\n", (**it).as_string(include_unprintable_fields).c_str());
} else {
(*it)->erase("comment_out");
fprintf(os, "#%s\n", (**it).as_string().c_str());
fprintf(os, "#%s\n", (**it).as_string(include_unprintable_fields).c_str());
}
}
os.close();
Expand Down
8 changes: 4 additions & 4 deletions src/c/breseq/genome_diff_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1185,7 +1185,7 @@ namespace breseq {

/*! Marshal this diff entry into an ordered list of fields.
*/
void cDiffEntry::marshal(vector<string>& s) const {
void cDiffEntry::marshal(vector<string>& s, bool include_unprintable_fields) const {
s.push_back(gd_entry_type_lookup_table[_type]);
s.push_back(_id);

Expand Down Expand Up @@ -1221,7 +1221,7 @@ namespace breseq {
for(diff_entry_map_t::iterator i=cp.begin(); i!=cp.end(); ++i) {

assert(i->first.size());
if (is_unprintable_key(i->first)) continue;
if (!include_unprintable_fields && is_unprintable_key(i->first)) continue;
if (i->second.empty()) continue;

// Be sure the entry is non-empty! Would rather have this as a check.
Expand All @@ -1234,10 +1234,10 @@ namespace breseq {
}

// Created the line to be printed
string cDiffEntry::as_string(void) const
string cDiffEntry::as_string(bool include_unprintable_fields) const
{
vector<string> fields;
marshal(fields);
marshal(fields, include_unprintable_fields);
return join(fields, "\t");
}

Expand Down
4 changes: 2 additions & 2 deletions src/c/breseq/libbreseq/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -1257,8 +1257,8 @@ inline cString cString::get_base_name() const

//! Returns file name with no extension, removes any directory path beforehand.
// input.1.gd
// remove_all_extensions = FALSE removes everything past the first period => input
// remove_all_extensions = TRUE removes only the last period and beyond => input.1
// remove_all_extensions = TRUE removes everything past the first period => input
// remove_all_extensions = FALSE removes only the last period and beyond => input.1
inline cString cString::get_base_name_no_extension(bool remove_all_extensions, bool one_name_for_pair) const
{
cString this_return = this->get_base_name();
Expand Down
2 changes: 1 addition & 1 deletion src/c/breseq/libbreseq/genome_diff.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class cGenomeDiff
cFileParseErrors valid_with_reference_sequences(cReferenceSequences& ref_seq, bool suppress_errors = false);

//! Write the genome diff to a file.
void write(const string& filename);
void write(const string& filename, bool include_unprintable_fields=false);


//!---- Adding and Removing Entries ---- !//
Expand Down
4 changes: 2 additions & 2 deletions src/c/breseq/libbreseq/genome_diff_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,10 @@ namespace breseq {
//!---- Output ---- !//

//! Marshal this diff entry into an ordered list of fields.
virtual void marshal(vector<string>& s) const;
virtual void marshal(vector<string>& s, bool include_unprintable_fields=false) const;

//! Serialize this diff entry into a string for output.
virtual string as_string(void) const;
virtual string as_string(bool include_unprintable_fields=false) const;

//! Output all keys and values
string as_key_values() const {
Expand Down
Loading

0 comments on commit e0a5c1d

Please sign in to comment.