Skip to content

Commit

Permalink
Added default filtering of reads ≤18 bp; exposed read filtering optio…
Browse files Browse the repository at this point in the history
…ns at the command line
  • Loading branch information
jeffreybarrick committed May 16, 2017
1 parent 7f6a179 commit 5a7ec72
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 35 deletions.
24 changes: 20 additions & 4 deletions src/c/breseq/breseq_cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,11 +434,12 @@ int do_convert_fastq(int argc, char* argv[])

uint64_t original_num_reads;
uint64_t original_num_bases;
uint32_t min_read_length;
uint32_t max_read_length;
uint8_t min_quality_score;
uint8_t max_quality_score;

input_format = cFastqQualityConverter::predict_fastq_file_format(input_file_name, original_num_reads, original_num_bases, max_read_length, min_quality_score, max_quality_score);
input_format = cFastqQualityConverter::predict_fastq_file_format(input_file_name, original_num_reads, original_num_bases, min_read_length, max_read_length, min_quality_score, max_quality_score);

cerr << " Predicted input format : " << input_format << endl;;
}
Expand Down Expand Up @@ -1109,10 +1110,11 @@ int do_assemble_unmatched(int argc, char* argv[])
// Figure out format (but just check one of the files to do this)
uint64_t original_num_reads;
uint64_t original_num_bases;
uint32_t min_read_length;
uint32_t max_read_length;
uint8_t min_quality_score;
uint8_t max_quality_score;
string quality_format = cFastqQualityConverter::predict_fastq_file_format(read_file_name_1, original_num_reads, original_num_bases, max_read_length, min_quality_score, max_quality_score);
string quality_format = cFastqQualityConverter::predict_fastq_file_format(read_file_name_1, original_num_reads, original_num_bases, min_read_length, max_read_length, min_quality_score, max_quality_score);

cFastqQualityConverter fqc(quality_format, "SANGER");
cFastqQualityConverter unmatched_fqc("SANGER", "SANGER");
Expand Down Expand Up @@ -1198,6 +1200,7 @@ int breseq_default_action(int argc, char* argv[])
if (!settings.aligned_sam_mode) {

//Check the FASTQ format and collect some information about the input read files at the same time
uint32_t overall_min_read_length = UNDEFINED_UINT32;
uint32_t overall_max_read_length = UNDEFINED_UINT32;
uint32_t overall_max_qual = 0;

Expand Down Expand Up @@ -1227,14 +1230,26 @@ int breseq_default_action(int argc, char* argv[])
string convert_file_name = settings.file_name(settings.converted_fastq_file_name, "#", base_name);

// Parse output
Summary::AnalyzeFastq s_rf = normalize_fastq(fastq_file_name, convert_file_name, i+1, settings.quality_score_trim, !settings.no_read_filtering, s.num_bases, read_file_base_limit);
Summary::AnalyzeFastq s_rf = normalize_fastq(fastq_file_name,
convert_file_name,
i+1,
settings.quality_score_trim,
!settings.no_read_filtering,
s.num_bases,
read_file_base_limit,
settings.read_file_min_read_length,
settings.read_file_max_same_base_fraction,
settings.read_file_max_N_fraction
);
settings.track_intermediate_file(settings.alignment_correction_done_file_name, convert_file_name);

// Save the converted file name -- have to save it in summary because only that
// is reloaded if we skip this step.
s.converted_fastq_name[base_name] = s_rf.converted_fastq_name;

// Record statistics
if ((overall_min_read_length == UNDEFINED_UINT32) || (s_rf.min_read_length > overall_min_read_length))
overall_min_read_length = s_rf.max_read_length;
if ((overall_max_read_length == UNDEFINED_UINT32) || (s_rf.max_read_length > overall_max_read_length))
overall_max_read_length = s_rf.max_read_length;
if ((overall_max_qual == UNDEFINED_UINT32) || (s_rf.max_quality_score > overall_max_qual))
Expand All @@ -1248,6 +1263,7 @@ int breseq_default_action(int argc, char* argv[])
}

s.avg_read_length = static_cast<double>(s.num_bases) / static_cast<double>(s.num_reads);
s.min_read_length = overall_min_read_length;
s.max_read_length = overall_max_read_length;
s.max_qual = overall_max_qual;
summary.sequence_conversion = s;
Expand All @@ -1256,7 +1272,7 @@ int breseq_default_action(int argc, char* argv[])
cerr << " ::TOTAL::" << endl;
cerr << " Original reads: " << s.original_num_reads << " bases: " << s.original_num_bases << endl;
cerr << " Analyzed reads: " << s.num_reads << " bases: " << s.num_bases << endl;

}


Expand Down
85 changes: 68 additions & 17 deletions src/c/breseq/fastq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ namespace breseq {
const int32_t trim_end_on_base_quality,
const bool filter_reads,
uint64_t current_read_file_bases,
const uint64_t read_file_base_limit
)
const uint64_t read_file_base_limit,
const uint32_t _min_read_length,
const double _max_same_base_fraction,
const double _max_N_fraction
)
{
cerr << " Converting/filtering FASTQ file..." << endl;

Expand All @@ -49,19 +52,26 @@ namespace breseq {

// Summary information that will be printed at the end
uint32_t max_read_length;
uint32_t min_read_length;
uint8_t min_quality_score;
uint8_t max_quality_score;
uint64_t original_num_bases;
uint64_t original_num_reads;

// Predict the format (and count original stats)
string quality_format = cFastqQualityConverter::predict_fastq_file_format(file_name, original_num_reads, original_num_bases, max_read_length, min_quality_score, max_quality_score);
string quality_format = cFastqQualityConverter::predict_fastq_file_format(file_name, original_num_reads, original_num_bases, min_read_length, max_read_length, min_quality_score, max_quality_score);

uint64_t num_bases = 0;
uint64_t num_reads = 0;
uint64_t homopolymer_filtered_reads = 0;

uint64_t min_length_filtered_reads = 0;
uint64_t same_base_filtered_reads = 0;
uint64_t N_filtered_reads = 0;

uint64_t min_length_filtered_bases = 0;
uint64_t same_base_filtered_bases = 0;
uint64_t N_filtered_bases = 0;

//debug
//cerr << "min_quality_score " << (int)min_quality_score << endl;
//cerr << "max_quality_score " << (int)max_quality_score << endl;
Expand All @@ -70,6 +80,9 @@ namespace breseq {

//std::cout << m_quality_format << std::endl;

uint32_t width_for_reads = to_string(original_num_reads).size();
uint32_t width_for_bases = to_string(original_num_bases).size();

cerr << " Original base quality format: " << quality_format << " New format: SANGER"<< endl;
cerr << " Original reads: " << original_num_reads << " bases: "<< original_num_bases << endl;

Expand Down Expand Up @@ -97,25 +110,34 @@ namespace breseq {

if ( filter_reads ) {

// Discard sequences that are too short
if ( _min_read_length && (on_sequence.length() < _min_read_length) ) {
min_length_filtered_reads++;
min_length_filtered_bases += on_sequence.length();
continue;
}

// Discard sequences that are 50% or more N.
if ( 0.5 * static_cast<double>(on_sequence.length()) <= static_cast<double>(on_sequence.m_base_counts[base_list_N_index]) ) {
if ( _max_N_fraction && (_max_N_fraction * static_cast<double>(on_sequence.length()) <= static_cast<double>(on_sequence.m_base_counts[base_list_N_index]) )) {
N_filtered_reads++;
N_filtered_bases += on_sequence.length();
continue;
}

// Ignore heavily homopolymer reads, as these are a common type of machine error
// Discard sequences that are 90% or more of a single base or N.
bool homopolymer_filtered = false;
bool same_base_filtered = false;
for (uint8_t i=0; i<base_list_including_N_size; i++) {
if ( 0.9 * static_cast<double>(on_sequence.length()) <=
if ( _max_same_base_fraction && (_max_same_base_fraction * static_cast<double>(on_sequence.length())) <=
static_cast<double>(on_sequence.m_base_counts[i] + on_sequence.m_base_counts[base_list_N_index]) ) {
homopolymer_filtered = true;
same_base_filtered = true;
break;
}
}

if (homopolymer_filtered) {
homopolymer_filtered_reads++;
if (same_base_filtered) {
same_base_filtered_reads++;
same_base_filtered_bases += on_sequence.length();
continue;
}

Expand Down Expand Up @@ -169,19 +191,45 @@ namespace breseq {
//cerr << "min_quality_score " << (int)min_quality_score << endl;
//cerr << "max_quality_score " << (int)max_quality_score << endl;

string display_same_base_percent = to_string(_max_same_base_fraction);
string display_N_percent = to_string(_max_N_fraction);

if (filter_reads) {
cerr << " Filtered reads: " << N_filtered_reads << " (≥50% N) " << homopolymer_filtered_reads << " (≥90% one base)" << endl;
if (N_filtered_reads + same_base_filtered_reads + min_length_filtered_reads == 0) {
cerr << " Filtered reads: none" << endl;
} else {

if (min_length_filtered_reads) {
cerr << " Filtered reads: " << setw(width_for_reads) << min_length_filtered_reads;
cerr << " bases: "<< setw(width_for_bases) << min_length_filtered_bases;
cerr << " (<" << _min_read_length << " bases long)" << endl;
}
if (N_filtered_reads) {
string percentage = formatted_double(100 * _max_N_fraction, 1).to_string();
cerr << " Filtered reads: " << setw(width_for_reads) << N_filtered_reads;
cerr << " bases: "<< setw(width_for_bases) << N_filtered_bases;
cerr << " (≥" << percentage << "% bases N)" << endl;
}
if (same_base_filtered_reads) {
string percentage = formatted_double(100 * _max_same_base_fraction, 1).to_string();
cerr << " Filtered reads: " << setw(width_for_reads) << same_base_filtered_reads;
cerr << " bases: "<< setw(width_for_bases) << same_base_filtered_bases;
cerr << " (≥" << percentage << "% same base)" << endl;
}
}
}
cerr << " Analyzed reads: " << num_reads << " bases: "<< num_bases << endl;


double avg_read_length = static_cast<double>(num_bases) / static_cast<double>(num_reads);

Summary::AnalyzeFastq retval(
min_read_length,
max_read_length,
avg_read_length,
original_num_reads,
homopolymer_filtered_reads,
min_length_filtered_reads,
same_base_filtered_reads,
N_filtered_reads,
num_reads,
min_quality_score,
Expand Down Expand Up @@ -316,11 +364,12 @@ namespace breseq {
}
}

string cFastqQualityConverter::predict_fastq_file_format(const string& file_name, uint64_t& original_num_reads, uint64_t& original_num_bases, uint32_t& max_read_length, uint8_t& min_quality_score, uint8_t& max_quality_score)
string cFastqQualityConverter::predict_fastq_file_format(const string& file_name, uint64_t& original_num_reads, uint64_t& original_num_bases, uint32_t& min_read_length, uint32_t& max_read_length, uint8_t& min_quality_score, uint8_t& max_quality_score)
{
// Initialize the input variables!
original_num_reads = 0;
original_num_bases = 0;
min_read_length = numeric_limits<uint32_t>::max();
max_read_length = 0;
min_quality_score = 255;
max_quality_score = 0;
Expand All @@ -343,10 +392,12 @@ namespace breseq {
original_num_reads++;

//check sequence length
if( on_sequence.m_sequence.size() > max_read_length ) max_read_length = on_sequence.m_sequence.size();

//add current sequence length to number of bases
original_num_bases += on_sequence.m_sequence.size();
min_read_length = min<uint32_t>(min_read_length, on_sequence.m_sequence.size());
max_read_length = max<uint32_t>(max_read_length, on_sequence.m_sequence.size());


//add current sequence length to number of bases
original_num_bases += on_sequence.m_sequence.size();

//iterate through sequence grabbing the associated scores
for (uint32_t i=0; i<on_sequence.m_qualities.size(); i++) {
Expand Down
4 changes: 2 additions & 2 deletions src/c/breseq/gdtools_cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1827,9 +1827,9 @@ int do_read_count(int argc, char* argv[])
uout << "Reading input FASTQ file: " << options.getArgv(i) << endl;;

uint64_t original_num_reads, original_num_bases;
uint32_t max_read_length;
uint32_t min_read_length, max_read_length;
uint8_t min_quality_score, max_quality_score;
string quality_format = cFastqQualityConverter::predict_fastq_file_format(options.getArgv(i), original_num_reads, original_num_bases, max_read_length, min_quality_score, max_quality_score);
string quality_format = cFastqQualityConverter::predict_fastq_file_format(options.getArgv(i), original_num_reads, original_num_bases, min_read_length, max_read_length, min_quality_score, max_quality_score);

uout << " Reads: " << original_num_reads << endl;
uout << " Bases: " << original_num_bases << endl;
Expand Down
7 changes: 5 additions & 2 deletions src/c/breseq/libbreseq/fastq.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ class cAnnotatedSequence;
const int32_t trim_end_on_base_quality,
const bool filter_reads,
uint64_t current_read_file_bases,
const uint64_t read_file_base_limit
const uint64_t read_file_base_limit,
const uint32_t min_read_length,
const double max_same_base_fraction,
const double max_N_fraction
);

// Utility function for converting FASTQ files between formats
Expand Down Expand Up @@ -152,7 +155,7 @@ class cAnnotatedSequence;

void convert_sequence(cFastqSequence &seq);

static string predict_fastq_file_format(const string& file_name, uint64_t& original_num_reads, uint64_t& original_num_bases, uint32_t& max_read_length, uint8_t& min_quality_score, uint8_t& max_quality_score);
static string predict_fastq_file_format(const string& file_name, uint64_t& original_num_reads, uint64_t& original_num_bases, uint32_t& min_read_length, uint32_t& max_read_length, uint8_t& min_quality_score, uint8_t& max_quality_score);

};

Expand Down
9 changes: 6 additions & 3 deletions src/c/breseq/libbreseq/settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,12 @@ namespace breseq
string base_output_path; // Default = cwd COMMAND-LINE OPTION

//! Settings: Read File Options
vector<string> read_file_names; // REQUIRED COMMAND-LINE OPTION
double read_file_coverage_fold_limit; // Default = 0 (OFF) COMMAND-LINE OPTION
bool aligned_sam_mode; // Default = false COMMAND-LINE OPTION
vector<string> read_file_names; // REQUIRED COMMAND-LINE OPTION
bool aligned_sam_mode; // Default = false COMMAND-LINE OPTION
double read_file_coverage_fold_limit; // Default = 0 (OFF) COMMAND-LINE OPTION
uint32_t read_file_min_read_length; // Default = 18 COMMAND-LINE OPTION
double read_file_max_same_base_fraction; // Default = 0.9 COMMAND-LINE OPTION
double read_file_max_N_fraction; // Default = 0.5 COMMAND-LINE OPTION

// Reference sequences
vector<string> all_reference_file_names; // REQUIRED COMMAND-LINE OPTION (filled by below)
Expand Down
Loading

0 comments on commit 5a7ec72

Please sign in to comment.