Skip to content

Commit

Permalink
Fixes for gdtools UNION and SUBTRACT to deal with phylogeny correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffreybarrick committed Nov 16, 2015
1 parent c529c3c commit e93f634
Show file tree
Hide file tree
Showing 5 changed files with 280 additions and 242 deletions.
57 changes: 6 additions & 51 deletions src/c/breseq/gdtools_cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,15 @@ int do_union(int argc, char *argv[])
return -1;
}

/*
if (options.getArgc() < 1) {
options.addUsage("");
options.addUsage("Must provide at least two input Genome Diff files.");
options.printUsage();
return -1;
}
*/
UserOutput uout("UNION");
cout << endl << " Preserving: " << (!options.count("evidence") ? "Mutations (3-letter codes)" : "Evidence (2-letter codes)") << endl;

Expand Down Expand Up @@ -244,6 +247,7 @@ int do_subtract(int argc, char *argv[])
AnyOption options("gdtools SUBTRACT [-o output.gd] input.gd subtract1.gd [subtract2.gd ...]");
options("help,h", "Display detailed help message", TAKES_NO_ARGUMENT);
options("output,o", "output GD file", "output.gd");
options("phylogeny-aware,p", "Check the optional 'phylogeny_id' field when deciding if entries are equivalent", TAKES_NO_ARGUMENT);
options("verbose,v", "verbose mode", TAKES_NO_ARGUMENT);
options.processCommandArgs(argc, argv);

Expand Down Expand Up @@ -276,7 +280,7 @@ int do_subtract(int argc, char *argv[])
for (int32_t i = 1; i < options.getArgc(); ++i) {
uout << options.getArgv(i) << endl;
cGenomeDiff gd2(options.getArgv(i));
gd1.set_subtract(gd2, verbose);
gd1.set_subtract(gd2, options.count("phylogeny-aware"), verbose);
}

uout("Writing output GD file", options["output"]);
Expand All @@ -285,55 +289,6 @@ int do_subtract(int argc, char *argv[])
return 0;
}

/* @JEB: Merge doesn't really make sense when we now require
int do_merge(int argc, char *argv[])
{
AnyOption options("gdtools MERGE [-o output.gd] input1.gd input2.gd ...");
options("output,o", "output GD file", "output.gd");
options("unique,u", "unique entries only", TAKES_NO_ARGUMENT);
options("id,i", "reorder IDs", TAKES_NO_ARGUMENT);
options("verbose,v", "verbose mode", TAKES_NO_ARGUMENT);
options.processCommandArgs(argc, argv);
options.addUsage("");
options.addUsage("Input as many GenomeDiff files as you want, and have them");
options.addUsage("merged together into a single GenomeDiff file specified.");
options.addUsage("Unique IDs will remain unique across files. Any IDs that");
options.addUsage("aren't unique will be assigned new ones.");
options.addUsage("");
options.addUsage("Header information will be inherited from the first input file,");
options.addUsage("so this function can also be used to transfer metadata to a new file.");
if (options.getArgc() < 2) {
options.addUsage("");
options.addUsage("At least two input Genome Diff files must be provided.");
options.printUsage();
return -1;
}
const bool verbose = options.count("verbose");
UserOutput uout("MERGE");
uout("Merging input GD files") << options.getArgv(0) << endl;
cGenomeDiff gd1(options.getArgv(0));
//Load all the GD files that were input.
for(int32_t i = 1; i < options.getArgc(); i++)
{
uout << options.getArgv(i) << endl;;
cGenomeDiff gd2(options.getArgv(i));
gd1.merge(gd2, options.count("unique"), options.count("id"), options.count("verbose"));
}
uout("Writing output GD file", options["output"]);
gd1.write(options["output"]);
return 0;
}
*/

int do_weights(int argc, char* argv[])
{
AnyOption options("gdtools WEIGHTS [-o output.gd input1.gd input2.gd ...");
Expand Down Expand Up @@ -1445,7 +1400,7 @@ int do_normalize_gd(int argc, char* argv[])
cGenomeDiff apply_gd(input);

new_ref_seq_info = cReferenceSequences::deep_copy(ref_seq_info);
apply_gd.apply_to_sequences(ref_seq_info, new_ref_seq_info, false, kDistanceToRepeat, settings.size_cutoff_AMP_becomes_INS_DEL_mutation);
apply_gd.apply_to_sequences(ref_seq_info, new_ref_seq_info, options.count("verbose"), kDistanceToRepeat, settings.size_cutoff_AMP_becomes_INS_DEL_mutation);


// Now transfer between and mediated tags to ones with the same IDs
Expand Down
155 changes: 122 additions & 33 deletions src/c/breseq/genome_diff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2628,10 +2628,27 @@ bool cGenomeDiff::mutation_in_entry_of_type(cDiffEntry mut, const gd_entry_type

//! Subtract mutations
//Note: Preserves evidence in the file being subtracted from
void cGenomeDiff::set_subtract(cGenomeDiff& gd, bool verbose)
void cGenomeDiff::set_subtract(cGenomeDiff& gd, bool phylogeny_id_aware, bool verbose)
{
(void)verbose; //unused

// Temporarily delete 'phylogeny_id' if we are not phylogeny aware
if (!phylogeny_id_aware) {
for (diff_entry_list_t::iterator it_new = gd._entry_list.begin(); it_new != gd._entry_list.end(); it_new++) {

// Save this info in field that won't affect comparisons
if ((*it_new)->entry_exists("phylogeny_id")) {
(**it_new)["_phylogeny_id"] = (**it_new)["phylogeny_id"];
}
if ((*it_new)->entry_exists("population_id")) {
(**it_new)["_population_id"] = (**it_new)["population_id"];
}

(*it_new)->erase("phylogeny_id");
(*it_new)->erase("population_id");
}
}

set<cDiffEntry> seen;
diff_entry_list_t muts = gd.mutation_list();
for (diff_entry_list_t::iterator it = muts.begin(); it != muts.end(); ++it) {
Expand All @@ -2651,16 +2668,49 @@ void cGenomeDiff::set_subtract(cGenomeDiff& gd, bool verbose)

//if (verbose) cout << entry << endl;

if (!phylogeny_id_aware) {
// Save this info in field that won't affect comparisons
if (entry.entry_exists("phylogeny_id")) {
entry["_phylogeny_id"] = entry["phylogeny_id"];
}
if (entry.entry_exists("population_id")) {
entry["_population_id"] = entry["population_id"];
}
}

entry.erase("phylogeny_id");
entry.erase("population_id");

//Subtract mutations that we've seen
if(entry.is_mutation() && seen.count(entry)) {
it = _entry_list.erase(it);
it_advance = false;
} else if (!phylogeny_id_aware) {
if (entry.entry_exists("_phylogeny_id")) {
entry["phylogeny_id"] = entry["_phylogeny_id"];
}
if (entry.entry_exists("_population_id")) {
entry["population_id"] = entry["_population_id"];
}
}

// Iterate it ONLY if we haven't erased something.
if(it_advance)it++;
}

// Restore fields that we hid if we are not phylogeny aware
if (!phylogeny_id_aware) {
for (diff_entry_list_t::iterator it_new = _entry_list.begin(); it_new != _entry_list.end(); it_new++) {

if ((*it_new)->entry_exists("_phylogeny_id")) {
(**it_new)["phylogeny_id"] = (**it_new)["_phylogeny_id"];
}
if ((*it_new)->entry_exists("_population_id")) {
(**it_new)["population_id"] = (**it_new)["_population_id"];
}
}
}

reassign_unique_ids();
}

Expand Down Expand Up @@ -2715,7 +2765,7 @@ void cGenomeDiff::set_union(cGenomeDiff& gd, bool evidence_mode, bool phylogeny_
this->remove_group(cGenomeDiff::VALIDATION);

// Merge and clear away all evidence reference
merge(gd, true, true, phylogeny_aware);
merge(gd, true, true, phylogeny_aware, verbose);

if (!evidence_mode)
this->remove_group(cGenomeDiff::EVIDENCE);
Expand Down Expand Up @@ -2828,34 +2878,48 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph

cGenomeDiff new_gd;

// Delete 'phylogeny_id' if we are not phylogeny aware
if (!phylogeny_id_aware) {
for (diff_entry_list_t::iterator it_new = merge_gd._entry_list.begin(); it_new != merge_gd._entry_list.end(); it_new++) {

(*it_new)->erase("phylogeny_id");
(*it_new)->erase("population_id");

}
}

// Add population info if we are phylogeny aware

// Add population info if we are phylogeny aware, we do this outside the
// loop because changing it shouldn't be able to disambiguoate any entries
// unlike phylogeny_id and population_id when in !phylogeny_id_aware mode
if (phylogeny_id_aware) {

diff_entry_list_t mut_list = this->mutation_list();
for (diff_entry_list_t::iterator it_new = merge_gd._entry_list.begin(); it_new != merge_gd._entry_list.end(); it_new++) {
(**it_new)["population_id"] = merge_gd.metadata.population;

diff_entry_list_t mut_list = merge_gd.mutation_list();
for (diff_entry_list_t::iterator it_new = mut_list.begin(); it_new != mut_list.end(); it_new++) {
if (merge_gd.metadata.population.size() != 0) {
(**it_new)["population_id"] = merge_gd.metadata.population;
}
}
}



//Iterate through all the potential new entries
for (diff_entry_list_t::iterator it_new = merge_gd._entry_list.begin(); it_new != merge_gd._entry_list.end(); it_new++)
{
//The current potential new entry we're looking at
cDiffEntry& entry_new = **it_new;


// Make a copy of the entry so that we are not changing
// the original GD at all!

// Allocating diff_entry_ptr_t takes care of disposal
cDiffEntry* diff_entry_copy = new cDiffEntry(**it_new);
diff_entry_ptr_t added_item(diff_entry_copy);

cDiffEntry& entry_new = *diff_entry_copy;

// Ditch the extra baggage, but only in the copy
if (!phylogeny_id_aware) {

// Save this info in field that won't affect comparisons
if (entry_new.entry_exists("phylogeny_id")) {
entry_new["_phylogeny_id"] = entry_new["phylogeny_id"];
}
if (entry_new.entry_exists("population_id")) {
entry_new["_population_id"] = entry_new["population_id"];
}

entry_new.erase("phylogeny_id");
entry_new.erase("population_id");
}

bool new_entry = true;

//Iterate through all the entries in the current list.
Expand All @@ -2880,7 +2944,7 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph
new_gd.add(entry_new, new_id);

//Notify user of new entry
if(verbose)cout << "\tNEW ENTRY\t" << entry_new._id << " >> " << _entry_list.back()->_id << "\t" << merge_gd.get_file_path() << endl;
//if(verbose)cout << "\tNEW ENTRY\t" << entry_new.as_string() << "\t" << merge_gd.get_file_path() << endl;
}
}

Expand Down Expand Up @@ -2909,7 +2973,7 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph
if((**it_cur) == (**it_new))
{
//Notify user of the update
if(verbose)cout << "\tID: " << (**it)._id << "\tEVIDENCE\t" << (**it)._evidence[iter] << " >> " << (**it_cur)._id << endl;
//if(verbose)cout << "\tID: " << (**it)._id << "\tEVIDENCE\t" << (**it)._evidence[iter] << " >> " << (**it_cur)._id << endl;

//Change the evidence ID to it's new ID in the new updated list
(**it)._evidence[iter] = (**it_cur)._id;
Expand All @@ -2925,24 +2989,41 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph
if(!found_match)
{
//Notify user of the update
if(verbose)cout << "\tID: " << (**it)._id << "\tEVIDENCE \t" << (**it)._evidence[iter] << " >> REMOVED" << endl;
//if(verbose)cout << "\tID: " << (**it)._id << "\tEVIDENCE \t" << (**it)._evidence[iter] << " >> REMOVED" << endl;
(**it)._evidence.erase((**it)._evidence.begin() + iter--);
}
}
}

// Add the ones from the new list to the current list
for (diff_entry_list_t::iterator it=new_gd._entry_list.begin(); it!= new_gd._entry_list.end(); it++) {
(**it)["_just_added"] = "1";

// Restore fields that we hid if we are not phylogeny aware
// this is necessary now so that we will match the proper first entries
if (!phylogeny_id_aware) {
if ((**it).entry_exists("_phylogeny_id")) {
(**it)["phylogeny_id"] = (**it)["_phylogeny_id"];
}
if ((**it).entry_exists("_population_id")) {
(**it)["population_id"] = (**it)["_population_id"];
}
}

add(**it);
}

// Fix optional fields that refer to mutation IDs
// Must be done after adding entries, because a before= or within= could have existed in the original GD (so not in the added list)
for (diff_entry_list_t::iterator it=new_gd._entry_list.begin(); it!= new_gd._entry_list.end(); it++) {
for (diff_entry_list_t::iterator it=_entry_list.begin(); it!= _entry_list.end(); it++) {

cDiffEntry& mut = **it;

if (!mut.is_mutation()) continue;
if (!mut.entry_exists("_just_added")) continue;

// Very important to erase for multiple merges!
mut.erase("_just_added");

for (vector<string>::const_iterator key_it=gd_keys_with_ids.begin(); key_it!= gd_keys_with_ids.end(); key_it++) {
if (mut.entry_exists(*key_it)) {
Expand All @@ -2951,17 +3032,22 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph
// Replace first value = mutation_id with substituted id
vector<string> split_value = split(value, ":");

// looking
// looking for original entry
// ---> note that this is unchanged for phylogeny mode
diff_entry_ptr_t looking_for = merge_gd.find_by_id(split_value[0]);

if (looking_for.get() == NULL) continue;

//cout << "Looking for: " << looking_for->as_string() << endl;

// iterate through all items to find the match
bool found_match = false;

//Iterate through all the current entries to find the match
for (diff_entry_list_t::iterator it_cur =_entry_list.begin(); it_cur != _entry_list.end(); it_cur++) {
if (**it_cur == *looking_for) {
//cout << "Found: " << (*it_cur)->as_string() << endl;

split_value[0] = to_string((**it_cur)._id);
found_match = true;
break;
Expand All @@ -2982,6 +3068,8 @@ void cGenomeDiff::merge(cGenomeDiff& merge_gd, bool unique, bool new_id, bool ph
}
}
}




//Notify user of the update
Expand Down Expand Up @@ -3228,9 +3316,13 @@ void cGenomeDiff::sort_and_check_for_duplicates(cFileParseErrors* file_parse_err
this->sort();

diff_entry_list_t::iterator it2;


for (diff_entry_list_t::iterator it1 = this->_entry_list.begin(); it1 != this->_entry_list.end(); it1++) {

//cout << (*it1)->as_string() << endl;
if ((*it1)->is_validation() && ((*it1)->_type != MASK) ) {
continue;
}

it2 = it1;
it2++;
Expand Down Expand Up @@ -4057,8 +4149,6 @@ void cGenomeDiff::apply_to_sequences(cReferenceSequences& ref_seq_info, cReferen
//cout << mut.as_string() << endl;
mediated_string = ref_seq_info.repeat_family_sequence(mut[MEDIATED], from_string<int16_t>(mut[MEDIATED_STRAND]), mut.entry_exists("mob_region") ? &mut["mob_region"] : NULL, &seq_id_picked, &repeat_feature_picked);
}



// We insert the pieces one at a time, so that we can update annotations
cLocation duplicated_region;
Expand Down Expand Up @@ -6117,10 +6207,9 @@ void cGenomeDiff::GD2OLI(const vector<string> &gd_file_names,
cout << " " << *it << endl;

cGenomeDiff this_gd(*it);

gd_list.push_back(this_gd);

master_gd.merge(this_gd, true, false, phylogeny_aware); // last true is to make phylogeny aware
master_gd.merge(this_gd, true, false, phylogeny_aware); // last true is to make phylogeny aware within populations
}

cGenomeDiff::sort_gd_list_by_treatment_population_time(gd_list);
Expand Down
2 changes: 1 addition & 1 deletion src/c/breseq/libbreseq/genome_diff.h
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ class cGenomeDiff
//!---- Set Operations ---- !//

//! Subtract mutations using gd_ref as reference.
void set_subtract(cGenomeDiff& gd_ref, bool verbose=false);
void set_subtract(cGenomeDiff& gd_ref, bool phylogeny_id_aware, bool verbose=false);

void set_intersect(cGenomeDiff& gd_ref, bool verbose=false);

Expand Down
Loading

0 comments on commit e93f634

Please sign in to comment.