From 3ede289553bc4f532cd78884f3aff2a166f534a9 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 13 Oct 2017 16:54:08 -0400 Subject: [PATCH 01/86] Bumps version. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c74a0e3..7dcebc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ find_package(Threads) #get_target_property(SHRINKWRAP_LIBS shrinkwrap INTERFACE_LINK_LIBRARIES) -add_definitions(-DSAVVY_VERSION="${PROJECT_VERSION}-alpha") +add_definitions(-DSAVVY_VERSION="${PROJECT_VERSION}-alpha2") set(HEADER_FILES include/savvy/savvy.hpp include/savvy/reader.hpp include/savvy/m3vcf_reader.hpp include/savvy/sav_reader.hpp include/savvy/allele_status.hpp include/savvy/vcf_reader.hpp include/savvy/varint.hpp include/savvy/s1r.hpp include/savvy/site_info.hpp include/savvy/variant_iterator.hpp include/savvy/portable_endian.hpp include/savvy/region.hpp include/savvy/compressed_vector.hpp include/savvy/eigen3_vector.hpp include/savvy/ublas_vector.hpp include/savvy/armadillo_vector.hpp include/savvy/utility.hpp include/savvy/data_format.hpp) From 73999b20c534806ceb44e6838845c7b8e807de09 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Mon, 16 Oct 2017 17:34:45 -0400 Subject: [PATCH 02/86] Deprecates site_info::locus in favor of site_info::position. --- include/savvy/region.hpp | 8 ++++---- include/savvy/sav_reader.hpp | 2 +- include/savvy/site_info.hpp | 9 +++++---- include/savvy/vcf_reader.hpp | 2 +- src/bcf2m3vcf.cpp | 4 ++-- src/sav_reader.cpp | 4 ++-- src/site_info.cpp | 2 +- src/test.cpp | 6 +++--- 8 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/savvy/region.hpp b/include/savvy/region.hpp index ec7a432..cd5e6a1 100644 --- a/include/savvy/region.hpp +++ b/include/savvy/region.hpp @@ -42,7 +42,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() <= reg.to() && (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1) >= reg.from() && var.chromosome() == reg.chromosome()); + return (var.position() <= reg.to() && (var.position() + std::max(var.ref().size(), var.alt().size()) - 1) >= reg.from() && var.chromosome() == reg.chromosome()); } }; @@ -50,7 +50,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() >= reg.from() && (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1) <= reg.to() && var.chromosome() == reg.chromosome()); + return (var.position() >= reg.from() && (var.position() + std::max(var.ref().size(), var.alt().size()) - 1) <= reg.to() && var.chromosome() == reg.chromosome()); } }; @@ -58,7 +58,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() >= reg.from() && var.locus() <= reg.to() && var.chromosome() == reg.chromosome()); + return (var.position() >= reg.from() && var.position() <= reg.to() && var.chromosome() == reg.chromosome()); } }; @@ -66,7 +66,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - std::uint64_t right = (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1); + std::uint64_t right = (var.position() + std::max(var.ref().size(), var.alt().size()) - 1); return (right >= reg.from() && right <= reg.to() && var.chromosome() == reg.chromosome()); } }; diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 244bb0e..14709eb 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -824,7 +824,7 @@ namespace savvy varint_encode(annotations.chromosome().size(), os_it); std::copy(annotations.chromosome().begin(), annotations.chromosome().end(), os_it); - varint_encode(annotations.locus(), os_it); + varint_encode(annotations.position(), os_it); varint_encode(annotations.ref().size(), os_it); if (annotations.ref().size()) diff --git a/include/savvy/site_info.hpp b/include/savvy/site_info.hpp index 7b29200..ac581f1 100644 --- a/include/savvy/site_info.hpp +++ b/include/savvy/site_info.hpp @@ -23,13 +23,13 @@ namespace savvy site_info( std::string&& chromosome, - std::uint64_t locus, + std::uint64_t pos, std::string&& ref, std::string&& alt, std::unordered_map&& properties) : chromosome_(std::move(chromosome)), - locus_(locus), + position_(pos), ref_(std::move(ref)), alt_(std::move(alt)), properties_(std::move(properties)) @@ -42,7 +42,8 @@ namespace savvy const std::string& chromosome() const { return chromosome_; } const std::string& ref() const { return ref_; } const std::string& alt() const { return alt_; } - std::uint64_t locus() const { return locus_; } + [[deprecated]] std::uint64_t locus() const { return position_; } + std::uint64_t position() const { return position_; } const std::string& prop(const std::string& key) const { auto it = properties_.find(key); @@ -55,7 +56,7 @@ namespace savvy std::string ref_; std::string alt_; std::unordered_map properties_; - std::uint64_t locus_; + std::uint64_t position_; static const std::string empty_string; }; diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 03b827c..3061bbb 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -1107,7 +1107,7 @@ namespace savvy else { (*output_stream_) << anno.chromosome() - << "\t" << anno.locus() + << "\t" << anno.position() << "\t" << std::string(anno.prop("ID").size() ? anno.prop("ID") : ".") << "\t" << anno.ref() << "\t" << anno.alt() diff --git a/src/bcf2m3vcf.cpp b/src/bcf2m3vcf.cpp index 230cac2..9a39e5d 100644 --- a/src/bcf2m3vcf.cpp +++ b/src/bcf2m3vcf.cpp @@ -22,11 +22,11 @@ int main(int argc, char** argv) savvy::m3vcf::block output_block(sample_ids.size(), 2); while (input.good()) { - if (!output_block.add_marker(variant.locus(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end())) + if (!output_block.add_marker(variant.position(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end())) { compact_output << output_block; output_block = savvy::m3vcf::block(sample_ids.size(), 2); - output_block.add_marker(variant.locus(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end()); + output_block.add_marker(variant.position(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end()); } input.read(variant, genotypes); diff --git a/src/sav_reader.cpp b/src/sav_reader.cpp index 5045037..8a502f6 100644 --- a/src/sav_reader.cpp +++ b/src/sav_reader.cpp @@ -56,8 +56,8 @@ namespace savvy ++records_in_block; current_chromosome = variant.chromosome(); - min = std::min(min, std::uint32_t(variant.locus())); - max = std::max(max, std::uint32_t(variant.locus() + std::max(variant.ref().size(), variant.alt().size()) - 1)); + min = std::min(min, std::uint32_t(variant.position())); + max = std::max(max, std::uint32_t(variant.position() + std::max(variant.ref().size(), variant.alt().size()) - 1)); std::int64_t end_pos = r.tellg(); if (start_pos != end_pos) // zstd frame frame boundary diff --git a/src/site_info.cpp b/src/site_info.cpp index b8ce639..12eab04 100644 --- a/src/site_info.cpp +++ b/src/site_info.cpp @@ -8,7 +8,7 @@ namespace savvy void print_vcf_site_info(std::ostream& out, const site_info& in, const std::vector& info_fields) { out << in.chromosome() << "\t" - << in.locus() << "\t" + << in.position() << "\t" << (in.prop("ID").size() ? in.prop("ID") : ".") << "\t" << in.ref() << "\t" << in.alt() << "\t" diff --git a/src/test.cpp b/src/test.cpp index 662e343..211d786 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -468,7 +468,7 @@ class file_checksum_test std::size_t num_markers = 0; while (reader >> variant) { - ret = hash_combine(ret, anno.locus()); + ret = hash_combine(ret, anno.position()); ret = hash_combine(ret, anno.ref()); ret = hash_combine(ret, anno.alt()); @@ -676,7 +676,7 @@ void random_access_test() while (rdr.read(anno, b)) { - std::cout << anno.chromosome() << " " << anno.locus() << " " << anno.ref() << " " << anno.alt() << std::endl; + std::cout << anno.chromosome() << " " << anno.position() << " " << anno.ref() << " " << anno.alt() << std::endl; } std::cout << "--------------------------------" << std::endl; @@ -685,7 +685,7 @@ void random_access_test() std::vector v; while (rdr.read(anno, v)) { - std::cout << anno.chromosome() << " " << anno.locus() << " " << anno.ref() << " " << anno.alt() << std::endl; + std::cout << anno.chromosome() << " " << anno.position() << " " << anno.ref() << " " << anno.alt() << std::endl; } } From e2fc21eb22131fc391bd577f2f2bcab476c8da25 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Thu, 26 Oct 2017 15:14:39 -0400 Subject: [PATCH 03/86] Adds initial HDS support. --- include/savvy/data_format.hpp | 1 + include/savvy/sav_reader.hpp | 137 ++++++++++++++++++++++++++++++---- include/savvy/vcf_reader.hpp | 46 ++++++++++++ src/sav2vcf.cpp | 2 + src/vcf2sav.cpp | 6 +- 5 files changed, 177 insertions(+), 15 deletions(-) diff --git a/include/savvy/data_format.hpp b/include/savvy/data_format.hpp index 7139c7a..728ee71 100644 --- a/include/savvy/data_format.hpp +++ b/include/savvy/data_format.hpp @@ -14,6 +14,7 @@ namespace savvy genotype_likelihood, phred_scaled_genotype_likelihood, dosage, + haplotype_dosage, //phase, // gt = genotype, // gp = genotype_probability, diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 14709eb..14d9c0b 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -343,8 +343,8 @@ namespace savvy std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - std::uint64_t next_ref_value_offset = 0; - std::uint64_t last_stride_offset = 0; + //std::uint64_t next_ref_value_offset = 0; + //std::uint64_t last_stride_offset = 0; for (std::size_t i = 0; i < sample_count(); ++i) { @@ -372,14 +372,14 @@ namespace savvy } template - void read_genotypes_ds(T& destination) + void read_genotypes_hds(T& destination) { - if (file_data_format_ != fmt::genotype_probability) + if (file_data_format_ != fmt::haplotype_dosage) input_stream_.setstate(std::ios::failbit); if (good()) { - const typename T::value_type alt_value{1}; + const typename T::value_type alt_value = typename T::value_type(1); std::istreambuf_iterator in_it(input_stream_); std::istreambuf_iterator end_it; @@ -390,19 +390,79 @@ namespace savvy } else { - destination.resize(sample_count()); + destination.resize(sample_count() * ploidy_level); std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - std::uint64_t ploidy_counter = 0; + //std::uint64_t next_ref_value_offset = 0; + //std::uint64_t last_stride_offset = 0; + + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) { typename T::value_type allele; std::uint64_t offset; std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; - destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); + + assert(total_offset < destination.size()); + destination[total_offset] = allele; + } + + input_stream_.get(); + } + } + } + + template + void read_genotypes_ds(T& destination) + { + if (file_data_format_ != fmt::genotype_probability && file_data_format_ != fmt::haplotype_dosage) + input_stream_.setstate(std::ios::failbit); + + if (good()) + { + const typename T::value_type alt_value{1}; + std::istreambuf_iterator in_it(input_stream_); + std::istreambuf_iterator end_it; + + std::uint64_t ploidy_level; + if (varint_decode(in_it, end_it, ploidy_level) == end_it) + { + this->input_stream_.setstate(std::ios::badbit); + } + else + { + destination.resize(sample_count()); + + std::uint64_t sz; + varint_decode(++in_it, end_it, sz); + std::uint64_t total_offset = 0; + //std::uint64_t ploidy_counter = 0; + + if (file_data_format_ == fmt::genotype_probability) + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; + destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); + } + } + else // fmt::haplotype_dosage + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; + destination[total_offset / ploidy_level] += allele; + } } input_stream_.get(); @@ -421,17 +481,16 @@ namespace savvy read_genotypes_gt(destination); else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) read_genotypes_gp(destination); - else if (requested_data_formats_[idx] == fmt::dosage && file_data_format_ == fmt::genotype_probability) + else if (requested_data_formats_[idx] == fmt::dosage && (file_data_format_ == fmt::genotype_probability || file_data_format_ == fmt::haplotype_dosage)) read_genotypes_ds(destination); + else if (requested_data_formats_[idx] == fmt::haplotype_dosage && file_data_format_ == fmt::haplotype_dosage) + read_genotypes_hds(destination); else input_stream_.setstate(std::ios::failbit); } else { - if (file_data_format_ == fmt::allele) - discard_genotypes(); - else - discard_genotypes(); + discard_genotypes(); } } @@ -690,7 +749,14 @@ namespace savvy headers_.resize(std::distance(headers_.begin(), copy_res)); // TODO: Handle unsupported formats. - headers_.push_back(std::make_pair("FORMAT", data_format_ == fmt::genotype_probability ? "" : "")); + const char* fmt_str; + if (data_format_ == fmt::genotype_probability) + fmt_str = ""; + else if (data_format_ == fmt::haplotype_dosage) + fmt_str = ""; + else + fmt_str = ""; + headers_.push_back(std::make_pair("FORMAT", fmt_str)); varint_encode(headers_.size(), out_it); for (auto it = headers_.begin(); it != headers_.end(); ++it) @@ -848,6 +914,10 @@ namespace savvy { write_probs(data); } + else if (data_format_ == fmt::haplotype_dosage) + { + write_hap_dosages(data); + } else { write_alleles(data); @@ -962,6 +1032,43 @@ namespace savvy } } } + + template + void write_hap_dosages(const std::vector& m) + { + const T ref_value = T(); + + std::ostreambuf_iterator os_it(output_stream_.rdbuf()); + + std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF); + + // TODO: check modulus and set error if needed. + varint_encode(ploidy, os_it); + + auto beg = m.begin(); + std::uint64_t non_zero_count = 0; + for (auto it = m.begin(); it != m.end(); ++it) + { + if (allele_encoder<7>::encode(*it) >= 0) + ++non_zero_count; + } + + + allele_count_ += non_zero_count; + varint_encode(non_zero_count, os_it); + std::uint64_t last_pos = 0; + for (auto it = beg; it != m.end(); ++it) + { + std::int8_t signed_allele = allele_encoder<7>::encode(*it); + if (signed_allele >= 0) + { + std::uint64_t dist = static_cast(std::distance(beg, it)); + std::uint64_t offset = dist - last_pos; + last_pos = dist + 1; + prefixed_varint<7>::encode((std::uint8_t)(signed_allele), offset, os_it); + } + } + } private: static const std::array empty_string_array; static const std::array, 0> empty_string_pair_array; @@ -1046,6 +1153,8 @@ namespace savvy file_data_format_ = fmt::allele; else if (format_field == "GP") file_data_format_ = fmt::genotype_probability; + else if (format_field == "HDS") + file_data_format_ = fmt::haplotype_dosage; } headers_.emplace_back(std::move(key), std::move(val)); } diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 3061bbb..d96d547 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -116,6 +116,8 @@ namespace savvy template void read_genotypes_ds(T& destination); template + void read_genotypes_hds(T& destination); + template void read_genotypes_gl(T& destination); template void read_genotypes_pl(T& destination); @@ -421,6 +423,10 @@ namespace savvy { read_genos_to<0>(fmt::dosage, destinations...); } + else if (fmt_key == "HDS") + { + read_genos_to<0>(fmt::haplotype_dosage, destinations...); + } else if (fmt_key == "GP") { read_genos_to<0>(fmt::genotype_probability, destinations...); @@ -460,6 +466,9 @@ namespace savvy case fmt::dosage: read_genotypes_ds(destination); break; + case fmt::haplotype_dosage: + read_genotypes_hds(destination); + break; case fmt::genotype_likelihood: read_genotypes_gl(destination); break; @@ -675,6 +684,43 @@ namespace savvy } } + template + template + void reader_base::read_genotypes_hds(T& destination) + { + if (good()) + { + if (hts_rec()->n_allele > 2) + { + state_ = std::ios::failbit; // multi allelic HDS not supported. + return; + } + + if (bcf_get_format_float(hts_hdr(),hts_rec(),"HDS", &(gt_), &(gt_sz_)) >= 0) + { + int num_samples = hts_hdr()->n[BCF_DT_SAMPLE]; + if (gt_sz_ % num_samples != 0) + { + // TODO: mixed ploidy at site error. + } + else + { + const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample); + destination.resize(gt_sz_); + + float* hds = (float*)(void*)(gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = hds[i]; + } + return; + } + } + + this->state_ = std::ios::failbit; + } + } + template template void reader_base::read_genotypes_gp(T& destination) diff --git a/src/sav2vcf.cpp b/src/sav2vcf.cpp index e2b5b8b..ae8e6f0 100644 --- a/src/sav2vcf.cpp +++ b/src/sav2vcf.cpp @@ -151,6 +151,8 @@ int main(int argc, char** argv) { if (header_id == "GT") it->second = ""; + else if (header_id == "HDS") + it->second = ""; else if (header_id == "GP") it->second = ""; // TODO: Handle other ploidy levels. } diff --git a/src/vcf2sav.cpp b/src/vcf2sav.cpp index 38e9820..e1063bf 100644 --- a/src/vcf2sav.cpp +++ b/src/vcf2sav.cpp @@ -51,7 +51,7 @@ class prog_args os << "\n"; os << " -# : # compression level (1-19, default: " << default_compression_level << ")\n"; os << " -b, --block-size : Number of markers in compression block (0-65535, default: " << default_block_size << ")\n"; - os << " -f, --format : Format field to copy (GT or GP, default: GT)\n"; + os << " -f, --format : Format field to copy (GT, HDS or GP, default: GT)\n"; os << " -h, --help : Print usage\n"; os << " -v, --version : Print version\n"; os << "----------------------------------------------\n"; @@ -91,6 +91,10 @@ class prog_args { format_ = savvy::fmt::genotype_probability; } + else if (str_opt_arg == "HDS") + { + format_ = savvy::fmt::haplotype_dosage; + } else if (str_opt_arg != "GT") { std::cerr << "Invalid format field value (" << str_opt_arg << ")\n"; From 79127f8729cb4944fb82170e041f399551a15005 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 10:55:15 -0400 Subject: [PATCH 04/86] Adds test file for haplotype dosages. --- test_file_hds.vcf | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 test_file_hds.vcf diff --git a/test_file_hds.vcf b/test_file_hds.vcf new file mode 100644 index 0000000..7376c55 --- /dev/null +++ b/test_file_hds.vcf @@ -0,0 +1,33 @@ +##fileformat=VCFv4.2 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##contig= +##phasing=all +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 +18 2234668 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.75 0.0,0.0 0.0,0.0 0.0,0.1 +18 2234679 . G T 29 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.70 0.0,0.0 0.0,0.0 0.0,0.1 +18 2234687 . G A 29 PASS . HDS 0.5,0.5 0.0,0.0 0.0,0.25 0.0,0.0 0.0,0.0 0.0,0.1 +18 2234697 . T A 29 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.15 0.0,0.0 0.0,0.0 0.9,0.1 +18 2234767 . T G 29 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.12 0.0,0.0 0.9,0.0 0.0,0.1 +20 14360 . G AC 29 PASS . HDS 0.5,0.5 0.0,0.0 0.0,0.21 0.0,0.0 0.0,0.0 0.3,0.1 +20 14370 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.75 0.0,0.0 0.0,0.0 0.0,0.1 +20 17330 . T A 3 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.91 0.0,0.0 0.0,0.0 0.2,0.1 +20 1110696 . A G 67 PASS . HDS 0.5,0.5 1.0,0.0 0.0,0.99 0.0,0.0 0.4,0.0 0.5,0.1 +20 1230237 . T . 47 PASS . HDS 0.5,0.5 0.0,0.0 0.0,0.47 0.0,0.0 0.0,0.0 0.0,0.1 +20 1234567 . GTC TCAAA 50 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 1234667 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 1234767 . T A 3 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 2230237 . T . 47 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 2234567 . GTC A 50 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 2234567 . GTC C 50 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 2234667 . G A 29 PASS . HDS 0.5,0.5 0.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 2234767 . T A 3 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 1.0,0.0 0.0,0.1 +20 3234567 . GTC TTT 50 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 3234668 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,1.0 0.0,0.0 0.0,0.1 +20 3234679 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 .,. 1.0,0.0 0.0,0.0 0.0,0.1 +20 3234687 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 .,. 0.0,0.0 0.0,0.0 0.0,0.1 +20 3234697 . G A 29 PASS . HDS 0.5,0.5 0.0,0.0 1.0,0.772 0.0,0.0 0.0,0.0 0.0,0.1 +20 3234767 . G A 29 PASS . HDS 0.5,0.5 1.0,0.0 1.0,0.223 0.0,0.0 0.0,0.0 0.0,0.1 From 1d9ebbfeaa894385511f5884ee0ad62e0361e731 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 12:25:05 -0400 Subject: [PATCH 05/86] Fixes library version retrieval. --- include/savvy/utility.hpp | 2 +- src/sav2vcf.cpp | 2 +- src/savvy_speed_test.cpp | 2 +- src/utility.cpp | 5 +++++ src/vcf2sav.cpp | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/savvy/utility.hpp b/include/savvy/utility.hpp index ddfb127..4c1033c 100644 --- a/include/savvy/utility.hpp +++ b/include/savvy/utility.hpp @@ -13,7 +13,7 @@ namespace savvy // return std::unique_ptr(new T(std::forward(args)...)); // } - static const std::string version = SAVVY_VERSION; + std::string savvy_version(); std::string parse_header_id(std::string header_value); diff --git a/src/sav2vcf.cpp b/src/sav2vcf.cpp index ae8e6f0..d9e4907 100644 --- a/src/sav2vcf.cpp +++ b/src/sav2vcf.cpp @@ -126,7 +126,7 @@ int main(int argc, char** argv) if (args.version_is_set()) { - std::cout << "sav2vcf v" << savvy::version << std::endl; + std::cout << "sav2vcf v" << savvy::savvy_version() << std::endl; return EXIT_SUCCESS; } diff --git a/src/savvy_speed_test.cpp b/src/savvy_speed_test.cpp index 5752193..7b09f42 100644 --- a/src/savvy_speed_test.cpp +++ b/src/savvy_speed_test.cpp @@ -126,7 +126,7 @@ int main(int argc, char** argv) if (args.version_is_set()) { - std::cout << "Savvy Speed Test v" << savvy::version << std::endl; + std::cout << "Savvy Speed Test v" << savvy::savvy_version() << std::endl; return EXIT_SUCCESS; } diff --git a/src/utility.cpp b/src/utility.cpp index d3ddfde..012344e 100644 --- a/src/utility.cpp +++ b/src/utility.cpp @@ -5,6 +5,11 @@ namespace savvy { + std::string savvy_version() + { + return std::string(SAVVY_VERSION); + } + std::string parse_header_id(std::string header_value) { if (header_value.size()) diff --git a/src/vcf2sav.cpp b/src/vcf2sav.cpp index e1063bf..74e2f30 100644 --- a/src/vcf2sav.cpp +++ b/src/vcf2sav.cpp @@ -162,7 +162,7 @@ int main(int argc, char** argv) if (args.version_is_set()) { - std::cout << "vcf2sav v" << savvy::version << std::endl; + std::cout << "vcf2sav v" << savvy::savvy_version() << std::endl; return EXIT_SUCCESS; } From 0018a49dea6e65f91935b72fba6a9417534cb72e Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 14:51:01 -0400 Subject: [PATCH 06/86] Removes extraction operator to support older compilers. --- include/savvy/reader.hpp | 7 ++++--- include/savvy/utility.hpp | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/savvy/reader.hpp b/include/savvy/reader.hpp index 2914aee..cc8ddfa 100644 --- a/include/savvy/reader.hpp +++ b/include/savvy/reader.hpp @@ -162,10 +162,10 @@ namespace savvy template reader(const std::string& file_path, T... data_formats); ~reader() {} - +#if __cpp_decltype_auto >= 201304 template reader& operator>>(std::tuple& destination); - +#endif template reader& read(site_info& annotations, T&... destinations); private: @@ -274,6 +274,7 @@ namespace savvy //################################################################// //################################################################// +#if __cpp_decltype_auto >= 201304 template template reader& reader::operator>>(std::tuple& destination) @@ -285,7 +286,7 @@ namespace savvy destination); return *this; } - +#endif template template reader& reader::read(site_info& annotations, T&... destinations) diff --git a/include/savvy/utility.hpp b/include/savvy/utility.hpp index 4c1033c..82f64cc 100644 --- a/include/savvy/utility.hpp +++ b/include/savvy/utility.hpp @@ -19,6 +19,7 @@ namespace savvy namespace detail { +#if __cpp_decltype_auto >= 201304 template decltype(auto) apply_impl(F&& fn, Tuple&& t, std::index_sequence) { @@ -34,6 +35,7 @@ namespace savvy std::forward(t), std::make_index_sequence()); } +#endif } } From 7f445def5533986dbdad43fde211150bc736d4c9 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 14:53:58 -0400 Subject: [PATCH 07/86] Removes extraction operator to support older compilers. --- include/savvy/sav_reader.hpp | 4 ++-- include/savvy/vcf_reader.hpp | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 14d9c0b..38e762c 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -525,7 +525,7 @@ namespace savvy { public: using reader_base::reader_base; - +#if __cpp_decltype_auto >= 201304 template reader& operator>>(std::tuple& destination) { @@ -536,7 +536,7 @@ namespace savvy destination); return *this; } - +#endif template reader& read(site_info& annotations, T&... destinations) { diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index d96d547..50e4772 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -224,10 +224,10 @@ namespace savvy void reset_region(const region& reg); std::vector chromosomes() const; - +#if __cpp_decltype_auto >= 201304 template indexed_reader& operator>>(std::tuple& destination); - +#endif template indexed_reader& read(site_info& annotations, T&... destinations); @@ -919,6 +919,7 @@ namespace savvy //################################################################// //################################################################// +#if __cpp_decltype_auto >= 201304 template template indexed_reader& indexed_reader::operator>>(std::tuple& destination) @@ -930,7 +931,7 @@ namespace savvy destination); return *this; } - +#endif template template indexed_reader& indexed_reader::read(site_info& annotations, T&... destinations) From accd355dbca75688b2568d985141bbb35bc17f23 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 14:56:10 -0400 Subject: [PATCH 08/86] Removes extraction operator to support older compilers. --- include/savvy/reader.hpp | 8 ++++---- include/savvy/sav_reader.hpp | 4 ++-- include/savvy/vcf_reader.hpp | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/savvy/reader.hpp b/include/savvy/reader.hpp index cc8ddfa..9415200 100644 --- a/include/savvy/reader.hpp +++ b/include/savvy/reader.hpp @@ -195,10 +195,10 @@ namespace savvy void reset_region(const region& reg); std::vector chromosomes() const; - +#if __cpp_decltype_auto >= 201304 template indexed_reader& operator>>(std::tuple& destination); - +#endif template indexed_reader& read(site_info& annotations, T&... destinations); @@ -348,7 +348,7 @@ namespace savvy else if (vcf_reader_) vcf_reader_->reset_region(reg); } - +#if __cpp_decltype_auto >= 201304 template template indexed_reader& indexed_reader::operator>>(std::tuple& destination) @@ -360,7 +360,7 @@ namespace savvy destination); return *this; } - +#endif template template indexed_reader& indexed_reader::read(site_info& annotations, T&... destinations) diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 38e762c..cf6c016 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -590,7 +590,7 @@ namespace savvy { return index_.tree_names(); } - +#if __cpp_decltype_auto >= 201304 template indexed_reader& operator>>(std::tuple& destination) { @@ -601,7 +601,7 @@ namespace savvy destination); return *this; } - +#endif template indexed_reader& read(site_info& annotations, T&... destinations) { diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 50e4772..f9e4cfe 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -147,7 +147,7 @@ namespace savvy reader(const reader&) = delete; reader& operator=(const reader&) = delete; ~reader(); - +#if __cpp_decltype_auto >= 201304 template reader& operator>>(std::tuple& destination) { @@ -158,7 +158,7 @@ namespace savvy destination); return *this; } - +#endif template reader& read(site_info& annotations, T&... destinations); From 46cef89eb5771234dec075f934a8eb0e421157e8 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 15:08:25 -0400 Subject: [PATCH 09/86] Removes extraction operator to support older compilers. --- src/test.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index 211d786..ede2f93 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -459,14 +459,13 @@ class file_checksum_test { std::size_t ret = 0; - std::tuple> variant; - auto& anno = std::get<0>(variant); - auto& data = std::get<1>(variant); + savvy::site_info anno; + std::vector data; auto prop_fields = reader.prop_fields(); std::size_t num_markers = 0; - while (reader >> variant) + while (reader.read(anno, data)) { ret = hash_combine(ret, anno.position()); ret = hash_combine(ret, anno.ref()); From c72c3fe6ef4386cc4d7f77238647c51ea5eef398 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 27 Oct 2017 15:13:54 -0400 Subject: [PATCH 10/86] Removes auto usage from lambda to support old compilers. --- src/test.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/test.cpp b/src/test.cpp index ede2f93..32065f9 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -666,12 +666,9 @@ void random_access_test() { savvy::sav::writer::create_index("test_file.sav"); - savvy::indexed_reader<1> rdr("", {"", 0, 0}, savvy::fmt::allele); - savvy::indexed_reader<1> tmp("test_file.sav", {"20", 17000, 1120000}, savvy::fmt::allele); + savvy::indexed_reader<1> rdr("test_file.sav", {"20", 17000, 1120000}, savvy::fmt::allele); savvy::site_info anno; std::vector b; - rdr.read_if([](const auto& m) { return true; }, anno, b); - rdr = std::move(tmp); while (rdr.read(anno, b)) { From 4373b1a2c24d5ad7a315686eb1a86d476b6c0637 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Wed, 8 Nov 2017 17:09:56 -0500 Subject: [PATCH 11/86] Removes c++14 requirement. --- CMakeLists.txt | 2 +- include/savvy/compressed_vector.hpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dcebc6..d82a120 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2) project(savvy VERSION 1.0.0) include(CMakePackageConfigHelpers) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) #add_library(hts STATIC IMPORTED) #set_property(TARGET hts PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/libhts.a) diff --git a/include/savvy/compressed_vector.hpp b/include/savvy/compressed_vector.hpp index 1baaa42..bcf6a0d 100644 --- a/include/savvy/compressed_vector.hpp +++ b/include/savvy/compressed_vector.hpp @@ -40,11 +40,11 @@ namespace savvy } } - auto begin() const { return this->values_.cbegin(); } - auto end() const { return this->values_.cend(); } + typename std::vector::const_iterator begin() const { return this->values_.cbegin(); } + typename std::vector::const_iterator end() const { return this->values_.cend(); } - auto begin() { return this->values_.begin(); } - auto end() { return this->values_.end(); } + typename std::vector::iterator begin() { return this->values_.begin(); } + typename std::vector::iterator end() { return this->values_.end(); } const value_type& operator[](std::size_t pos) const { From 7fc474c31e8ee46dad5cbe7e10b5d4eeab6940d0 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Wed, 8 Nov 2017 17:36:05 -0500 Subject: [PATCH 12/86] Fixes shared lib building of htslib. --- dep/htslib.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dep/htslib.cmake b/dep/htslib.cmake index 2abcd70..844782e 100644 --- a/dep/htslib.cmake +++ b/dep/htslib.cmake @@ -10,8 +10,12 @@ install(DIRECTORY htslib DESTINATION include) if (BUILD_SHARED_LIBS) install(FILES ${CMAKE_SHARED_LIBRARY_PREFIX}hts${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_SHARED_LIBRARY_PREFIX}hts.1${CMAKE_SHARED_LIBRARY_SUFFIX} DESTINATION lib) + install(FILES + ${CMAKE_SHARED_LIBRARY_PREFIX}hts.1${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_SHARED_LIBRARY_PREFIX}hts${CMAKE_SHARED_LIBRARY_SUFFIX}.1 + DESTINATION lib + OPTIONAL) else() install(FILES ${CMAKE_STATIC_LIBRARY_PREFIX}hts${CMAKE_STATIC_LIBRARY_SUFFIX} From 1cebb0256d4a89c4075bb89d878a4cd49dd4a394 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 10 Nov 2017 17:37:20 -0500 Subject: [PATCH 13/86] Removes locus member function from site_info. --- include/savvy/site_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/savvy/site_info.hpp b/include/savvy/site_info.hpp index ac581f1..a8163fe 100644 --- a/include/savvy/site_info.hpp +++ b/include/savvy/site_info.hpp @@ -42,7 +42,7 @@ namespace savvy const std::string& chromosome() const { return chromosome_; } const std::string& ref() const { return ref_; } const std::string& alt() const { return alt_; } - [[deprecated]] std::uint64_t locus() const { return position_; } + //[[deprecated]] std::uint64_t locus() const { return position_; } std::uint64_t position() const { return position_; } const std::string& prop(const std::string& key) const { From e0d1bbddfb774d1f24aea7946dbe8e97cd8c1736 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Tue, 14 Nov 2017 15:26:44 -0500 Subject: [PATCH 14/86] Changes branch on shrinkwrap dependency. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e707d7f..ab3c903 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -jonathonl/shrinkwrap@v1.0.0-alpha3 +jonathonl/shrinkwrap@develop htslib,https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 --hash md5:16d78f90b72f29971b042e8da8be6843 --cmake dep/htslib.cmake \ No newline at end of file From f7f58307a391695365f73afec83fb7c63dd71595 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Tue, 14 Nov 2017 16:02:18 -0500 Subject: [PATCH 15/86] Allows for shared zstd linking. --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d82a120..8f2fdbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,7 @@ endif() #find_package(shrinkwrap CONFIG REQUIRED) find_library(HTS_LIBRARY hts) find_library(ZLIB_LIBRARY z) -#ZSTD can only likn statically since experimental functions are being used. -find_library(ZSTD_LIBRARY NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}) +find_library(ZSTD_LIBRARY zstd) find_package(Threads) #get_target_property(SHRINKWRAP_LIBS shrinkwrap INTERFACE_LINK_LIBRARIES) From 12463a3334602a46c932e6807e63f96802d486da Mon Sep 17 00:00:00 2001 From: jonathonl Date: Wed, 15 Nov 2017 12:24:56 -0500 Subject: [PATCH 16/86] Fixes crash during region query. --- include/savvy/vcf_reader.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index f9e4cfe..952c9d2 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -938,7 +938,7 @@ namespace savvy { while (this->good()) { - this->read_variant(annotations, destinations...); + this->read_variant_details(annotations); if (this->good() && region_compare(bounding_type_, annotations, region_)) { this->read_requested_genos(destinations...); @@ -992,8 +992,8 @@ namespace savvy if (reg.from() > 1 || reg.to() != std::numeric_limits::max()) contigs << ":" << reg.from() << "-" << reg.to(); - bcf_sr_set_regions(synced_readers_, contigs.str().c_str(), 0); - if (bcf_sr_add_reader(synced_readers_, file_path_.c_str())) + + if (bcf_sr_set_regions(synced_readers_, contigs.str().c_str(), 0) == 0 && bcf_sr_add_reader(synced_readers_, file_path_.c_str()) == 1) this->init_property_fields(); else this->state_ = std::ios::badbit; From 635e9ded28a4df9f7b462bffdf6c86efb480fe39 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Thu, 16 Nov 2017 16:29:36 -0500 Subject: [PATCH 17/86] Adds subset_samples to SAV readers. --- include/savvy/sav_reader.hpp | 332 +++++++++++++++++++++++++---------- 1 file changed, 236 insertions(+), 96 deletions(-) diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index cf6c016..a4861b5 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace savvy { @@ -92,6 +93,14 @@ namespace savvy std::vector prop_fields() const { return std::vector(metadata_fields_); } std::vector> headers() const { return headers_; } + /** + * + * @param subset IDs to include if they exist in file. + * @return intersect of subset and samples IDs in file. + */ + std::vector subset_samples(const std::set& subset); + std::vector subset_samples(const std::vector& subset); + const std::string& file_path() const { return file_path_; } std::streampos tellg() { return this->input_stream_.tellg(); } protected: @@ -260,57 +269,38 @@ namespace savvy } else { - destination.resize(sample_count() * ploidy_level); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) - { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset] = allele; //(allele ? missing_value : alt_value); - } - - input_stream_.get(); - } - } - } - template - void read_genotypes_gt(T& destination) - { - if (file_data_format_ != fmt::allele) - input_stream_.setstate(std::ios::failbit); - - if (good()) - { - const typename T::value_type alt_value{1}; - std::istreambuf_iterator in_it(input_stream_); - std::istreambuf_iterator end_it; + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy_level); - std::uint64_t ploidy_level; - if (varint_decode(in_it, end_it, ploidy_level) == end_it) - { - this->input_stream_.setstate(std::ios::badbit); - } - else - { - destination.resize(sample_count()); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; - std::uint64_t sz; - varint_decode(++in_it, end_it, sz); - std::uint64_t total_offset = 0; - std::uint64_t ploidy_counter = 0; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (total_offset % ploidy_level)] = allele; //(allele ? missing_value : alt_value); + } + } + else { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + destination.resize(sample_count() * ploidy_level); + + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; + destination[total_offset] = allele; //(allele ? missing_value : alt_value); + } } input_stream_.get(); @@ -319,14 +309,14 @@ namespace savvy } template - void read_genotypes_gp(T& destination) + void read_genotypes_gt(T& destination) { - if (file_data_format_ != fmt::genotype_probability) + if (file_data_format_ != fmt::allele) input_stream_.setstate(std::ios::failbit); if (good()) { - const typename T::value_type alt_value = typename T::value_type(1); + const typename T::value_type alt_value{1}; std::istreambuf_iterator in_it(input_stream_); std::istreambuf_iterator end_it; @@ -337,33 +327,38 @@ namespace savvy } else { - std::size_t stride = ploidy_level + 1; - destination.resize(sample_count() * stride); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - //std::uint64_t next_ref_value_offset = 0; - //std::uint64_t last_stride_offset = 0; - for (std::size_t i = 0; i < sample_count(); ++i) + if (subset_map_.size()) { - assert(i < destination.size()); - destination[i * stride] = typename T::value_type(1); - } + destination.resize(subset_size_); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += allele; //(allele ? missing_value : alt_value); + } + } + else { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + destination.resize(sample_count()); - total_offset += offset; - - assert(total_offset < destination.size()); - destination[total_offset] = allele; - destination[(total_offset / stride) * stride] -= allele; + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + total_offset += offset; + destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + } } input_stream_.get(); @@ -371,6 +366,59 @@ namespace savvy } } +// template +// void read_genotypes_gp(T& destination) +// { +// if (file_data_format_ != fmt::genotype_probability) +// input_stream_.setstate(std::ios::failbit); +// +// if (good()) +// { +// const typename T::value_type alt_value = typename T::value_type(1); +// std::istreambuf_iterator in_it(input_stream_); +// std::istreambuf_iterator end_it; +// +// std::uint64_t ploidy_level; +// if (varint_decode(in_it, end_it, ploidy_level) == end_it) +// { +// this->input_stream_.setstate(std::ios::badbit); +// } +// else +// { +// std::size_t stride = ploidy_level + 1; +// destination.resize(sample_count() * stride); +// +// std::uint64_t sz; +// varint_decode(++in_it, end_it, sz); +// std::uint64_t total_offset = 0; +// //std::uint64_t next_ref_value_offset = 0; +// //std::uint64_t last_stride_offset = 0; +// +// for (std::size_t i = 0; i < sample_count(); ++i) +// { +// assert(i < destination.size()); +// destination[i * stride] = typename T::value_type(1); +// } +// +// +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// +// total_offset += offset; +// +// assert(total_offset < destination.size()); +// destination[total_offset] = allele; +// destination[(total_offset / stride) * stride] -= allele; +// } +// +// input_stream_.get(); +// } +// } +// } + template void read_genotypes_hds(T& destination) { @@ -390,25 +438,43 @@ namespace savvy } else { - destination.resize(sample_count() * ploidy_level); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - //std::uint64_t next_ref_value_offset = 0; - //std::uint64_t last_stride_offset = 0; + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy_level); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (total_offset % ploidy_level)] = allele; + } + } + else { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + destination.resize(sample_count() * ploidy_level); - total_offset += offset; + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - assert(total_offset < destination.size()); - destination[total_offset] = allele; + total_offset += offset; + + assert(total_offset < destination.size()); + destination[total_offset] = allele; + } } input_stream_.get(); @@ -419,7 +485,7 @@ namespace savvy template void read_genotypes_ds(T& destination) { - if (file_data_format_ != fmt::genotype_probability && file_data_format_ != fmt::haplotype_dosage) + if (file_data_format_ != fmt::haplotype_dosage) input_stream_.setstate(std::ios::failbit); if (good()) @@ -435,26 +501,30 @@ namespace savvy } else { - destination.resize(sample_count()); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - //std::uint64_t ploidy_counter = 0; - if (file_data_format_ == fmt::genotype_probability) + if (subset_map_.size()) { + destination.resize(subset_size_); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) { typename T::value_type allele; std::uint64_t offset; std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); total_offset += offset; - destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += allele; } } - else // fmt::haplotype_dosage + else { + destination.resize(sample_count()); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) { typename T::value_type allele; @@ -464,6 +534,35 @@ namespace savvy destination[total_offset / ploidy_level] += allele; } } +// destination.resize(sample_count()); +// +// std::uint64_t sz; +// varint_decode(++in_it, end_it, sz); +// std::uint64_t total_offset = 0; +// //std::uint64_t ploidy_counter = 0; +// +// if (file_data_format_ == fmt::genotype_probability) +// { +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// total_offset += offset; +// destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); +// } +// } +// else // fmt::haplotype_dosage +// { +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// total_offset += offset; +// destination[total_offset / ploidy_level] += allele; +// } +// } input_stream_.get(); } @@ -479,9 +578,9 @@ namespace savvy read_genotypes_al(destination); else if (requested_data_formats_[idx] == fmt::genotype && file_data_format_ == fmt::allele) read_genotypes_gt(destination); - else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) - read_genotypes_gp(destination); - else if (requested_data_formats_[idx] == fmt::dosage && (file_data_format_ == fmt::genotype_probability || file_data_format_ == fmt::haplotype_dosage)) +// else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) +// read_genotypes_gp(destination); + else if (requested_data_formats_[idx] == fmt::dosage && file_data_format_ == fmt::haplotype_dosage) read_genotypes_ds(destination); else if (requested_data_formats_[idx] == fmt::haplotype_dosage && file_data_format_ == fmt::haplotype_dosage) read_genotypes_hds(destination); @@ -510,6 +609,8 @@ namespace savvy protected: std::vector sample_ids_; + std::vector subset_map_; + std::uint64_t subset_size_; fmt file_data_format_; std::array requested_data_formats_; shrinkwrap::zstd::istream input_stream_; @@ -750,10 +851,10 @@ namespace savvy // TODO: Handle unsupported formats. const char* fmt_str; - if (data_format_ == fmt::genotype_probability) - fmt_str = ""; - else if (data_format_ == fmt::haplotype_dosage) + if (data_format_ == fmt::haplotype_dosage) fmt_str = ""; +// else if (data_format_ == fmt::genotype_probability) +// fmt_str = ""; else fmt_str = ""; headers_.push_back(std::make_pair("FORMAT", fmt_str)); @@ -910,14 +1011,14 @@ namespace savvy std::copy(value.begin(), value.end(), os_it); } - if (data_format_ == fmt::genotype_probability) - { - write_probs(data); - } - else if (data_format_ == fmt::haplotype_dosage) + if (data_format_ == fmt::haplotype_dosage) { write_hap_dosages(data); } +// else if (data_format_ == fmt::genotype_probability) +// { +// write_probs(data); +// } else { write_alleles(data); @@ -1095,6 +1196,7 @@ namespace savvy template template reader_base::reader_base(const std::string& file_path, T... data_formats) : + subset_size_(0), input_stream_(file_path), file_path_(file_path), file_data_format_(fmt::allele) @@ -1151,8 +1253,8 @@ namespace savvy std::string format_field = parse_header_id(val); if (format_field == "GT") file_data_format_ = fmt::allele; - else if (format_field == "GP") - file_data_format_ = fmt::genotype_probability; +// else if (format_field == "GP") +// file_data_format_ = fmt::genotype_probability; else if (format_field == "HDS") file_data_format_ = fmt::haplotype_dosage; } @@ -1199,6 +1301,8 @@ namespace savvy template reader_base::reader_base(reader_base&& source) : sample_ids_(std::move(source.sample_ids_)), + subset_map_(std::move(source.subset_map_)), + subset_size_(source.subset_size_), //sbuf_(std::move(source.sbuf_)), //input_stream_(&sbuf_), input_stream_(std::move(source.input_stream_)), @@ -1215,6 +1319,8 @@ namespace savvy if (&source != this) { sample_ids_ = std::move(source.sample_ids_); + subset_map_ = std::move(source.subset_map_); + subset_size_ = source.subset_size_; //sbuf_ = std::move(source.sbuf_); //input_stream_.rdbuf(&sbuf_); input_stream_ = std::move(source.input_stream_); @@ -1227,6 +1333,40 @@ namespace savvy } #endif + template + std::vector reader_base::subset_samples(const std::set& subset) + { + std::vector ret; + ret.reserve(std::min(subset.size(), sample_ids_.size())); + + subset_map_.clear(); + subset_map_.resize(sample_ids_.size(), std::numeric_limits::max()); + std::uint64_t subset_index = 0; + for (auto it = sample_ids_.begin(); it != sample_ids_.end(); ++it) + { + if (subset.find(*it) != subset.end()) + { + subset_map_[std::distance(sample_ids_.begin(), it)] = subset_index; + ret.push_back(*it); + } + else + { + ++subset_index; + } + } + + subset_size_ = subset_index; + + return ret; + } + + template + std::vector reader_base::subset_samples(const std::vector& subset) + { + std::set unique(subset.begin(), subset.end()); + return subset_samples(unique); + } + template <> template inline std::tuple detail::allele_decoder<0>::decode(std::istreambuf_iterator& in_it, const std::istreambuf_iterator& end_it, const T& missing_value) From 3acb8cff4bb036fe9782404f8c9e94eb8d41a742 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Thu, 16 Nov 2017 17:24:27 -0500 Subject: [PATCH 18/86] Finishes sample subsetting. (Needs testing) --- include/savvy/reader.hpp | 23 ++++ include/savvy/vcf_reader.hpp | 245 ++++++++++++++++++++++++++++++----- 2 files changed, 235 insertions(+), 33 deletions(-) diff --git a/include/savvy/reader.hpp b/include/savvy/reader.hpp index 9415200..cb995b2 100644 --- a/include/savvy/reader.hpp +++ b/include/savvy/reader.hpp @@ -146,6 +146,9 @@ namespace savvy sample_iterator samples_begin() const; sample_iterator samples_end() const; std::size_t sample_size() const; + + std::vector subset_samples(const std::set& subset); + std::vector subset_samples(const std::vector& subset); protected: virtual savvy::sav::reader_base* sav_impl() const = 0; virtual savvy::vcf::reader_base* vcf_impl() const = 0; @@ -271,6 +274,26 @@ namespace savvy ret = static_cast(vcf_impl()->samples_end() - vcf_impl()->samples_begin()); return ret; } + + template + std::vector reader_base::subset_samples(const std::set& subset) + { + if (sav_impl()) + return sav_impl()->subset_samples(subset); + else if (vcf_impl()) + return vcf_impl()->subset_samples(subset); + return std::vector(); + } + + template + std::vector reader_base::subset_samples(const std::vector& subset) + { + if (sav_impl()) + return sav_impl()->subset_samples(subset); + else if (vcf_impl()) + return vcf_impl()->subset_samples(subset); + return std::vector(); + } //################################################################// //################################################################// diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 952c9d2..fe95e5a 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -26,6 +26,8 @@ #include #include #include +#include +#include namespace savvy { @@ -45,6 +47,7 @@ namespace savvy { public: reader_base() : + subset_size_(0), state_(std::ios::goodbit), gt_(nullptr), gt_sz_(0), @@ -52,6 +55,8 @@ namespace savvy {} reader_base(reader_base&& source) : + subset_map_(std::move(source.subset_map_)), + subset_size_(source.subset_size_), state_(source.state_), gt_(source.gt_), gt_sz_(source.gt_sz_), @@ -79,6 +84,9 @@ namespace savvy const char** samples_begin() const; const char** samples_end() const; + std::vector subset_samples(const std::set& subset); + std::vector subset_samples(const std::vector& subset); + std::vector prop_fields() const; std::vector> headers() const; // std::vector::const_iterator prop_fields_begin() const { return property_fields_.begin(); } @@ -126,6 +134,8 @@ namespace savvy template void init_requested_formats(fmt f, T2... args); protected: + std::vector subset_map_; + std::uint64_t subset_size_; std::ios::iostate state_; int* gt_; int gt_sz_; @@ -320,6 +330,42 @@ namespace savvy return ret; } + template + std::vector reader_base::subset_samples(const std::set& subset) + { + std::vector ret; + const char** beg = hts_hdr() ? (const char**) hts_hdr()->samples : nullptr; + const char** end = hts_hdr() ? (const char**) (hts_hdr()->samples + bcf_hdr_nsamples(hts_hdr())) : nullptr; + ret.reserve(std::min(subset.size(), (std::size_t)std::distance(beg, end))); + + subset_map_.clear(); + subset_map_.resize((std::size_t)std::distance(beg, end), std::numeric_limits::max()); + std::uint64_t subset_index = 0; + for (auto it = beg; it != end; ++it) + { + if (subset.find(*it) != subset.end()) + { + subset_map_[std::distance(beg, it)] = subset_index; + ret.push_back(*it); + } + else + { + ++subset_index; + } + } + + subset_size_ = subset_index; + + return ret; + } + + template + std::vector reader_base::subset_samples(const std::vector& subset) + { + std::set unique(subset.begin(), subset.end()); + return subset_samples(unique); + } + template std::vector reader_base::prop_fields() const { @@ -595,15 +641,35 @@ namespace savvy } else { - destination.resize(gt_sz_); - const int allele_index_plus_one = allele_index_ + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + const std::uint64_t ploidy(gt_sz_ / sample_count()); + if (subset_map_.size()) { - if (gt_[i] == bcf_gt_missing) - destination[i] = std::numeric_limits::quiet_NaN(); - else if ((gt_[i] >> 1) == allele_index_plus_one) - destination[i] = alt_value; + destination.resize(subset_size_ * ploidy); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (gt_[i] == bcf_gt_missing) + destination[subset_map_[sample_index] + (i % ploidy)] = std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[subset_map_[sample_index] + (i % ploidy)] = alt_value; + } + } + } + else + { + destination.resize(gt_sz_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (gt_[i] == bcf_gt_missing) + destination[i] = std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[i] = alt_value; + } } return; } @@ -629,15 +695,35 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / sample_count()); - destination.resize(sample_count()); - const int allele_index_plus_one = allele_index_ + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + + if (subset_map_.size()) { - if (gt_[i] == bcf_gt_missing) - destination[i / ploidy] += std::numeric_limits::quiet_NaN(); - else if ((gt_[i] >> 1) == allele_index_plus_one) - destination[i / ploidy] += alt_value; + destination.resize(subset_size_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (gt_[i] == bcf_gt_missing) + destination[subset_map_[sample_index]] += std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[subset_map_[sample_index]] += alt_value; + } + } + } + else + { + destination.resize(sample_count()); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (gt_[i] == bcf_gt_missing) + destination[i / ploidy] += std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[i / ploidy] += alt_value; + } } return; } @@ -669,12 +755,33 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample); - destination.resize(gt_sz_); + const typename T::value_type zero_value{0}; - float* ds = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = ds[i]; + destination.resize(subset_size_); + + float* ds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (ds[i] != zero_value) + destination[subset_map_[sample_index]] = ds[i]; + } + } + } + else + { + destination.resize(gt_sz_); + + float* ds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (ds[i] != zero_value) + destination[i] = ds[i]; + } } return; } @@ -706,12 +813,33 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample); - destination.resize(gt_sz_); + const typename T::value_type zero_value{0}; + + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy); - float* hds = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + float* hds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (hds[i] != zero_value) + destination[subset_map_[sample_index] + (i % ploidy)] = hds[i]; + } + } + } + else { - destination[i] = hds[i]; + destination.resize(gt_sz_); + + float* hds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (hds[i] != zero_value) + destination[i] = hds[i]; + } } return; } @@ -743,12 +871,29 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - float* gp = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy_plus_one); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (i % ploidy_plus_one)] = gp[i]; + } + } + else { - destination[i] = gp[i]; + destination.resize(gt_sz_); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gp[i]; + } } return; } @@ -780,12 +925,29 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - float* gp = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = gp[i]; + destination.resize(subset_size_ * ploidy_plus_one); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (i % ploidy_plus_one)] = gp[i]; + } + } + else + { + destination.resize(gt_sz_); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gp[i]; + } } return; } @@ -817,11 +979,28 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = gt_[i]; + destination.resize(subset_size_ * ploidy_plus_one); + + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (i % ploidy_plus_one)] = gt_[i]; + } + } + else + { + destination.resize(gt_sz_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gt_[i]; + } } return; } From 0256f60386c8cbd216ee57c6b58e393b79074b4f Mon Sep 17 00:00:00 2001 From: jonathonl Date: Thu, 16 Nov 2017 22:52:08 -0500 Subject: [PATCH 19/86] Adds subset test and fixes subset bug and genotype format with SAV. --- include/savvy/sav_reader.hpp | 5 +--- include/savvy/vcf_reader.hpp | 3 --- src/test.cpp | 52 ++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index a4861b5..6bfea7f 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -572,7 +572,7 @@ namespace savvy template void read_genotypes(std::size_t idx, T& destination) { - if (requested_data_formats_[idx] == file_data_format_) + if (true) //requested_data_formats_[idx] == file_data_format_) { if (requested_data_formats_[idx] == fmt::allele && file_data_format_ == fmt::allele) read_genotypes_al(destination); @@ -1348,9 +1348,6 @@ namespace savvy { subset_map_[std::distance(sample_ids_.begin(), it)] = subset_index; ret.push_back(*it); - } - else - { ++subset_index; } } diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index fe95e5a..18ddd90 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -347,9 +347,6 @@ namespace savvy { subset_map_[std::distance(beg, it)] = subset_index; ret.push_back(*it); - } - else - { ++subset_index; } } diff --git a/src/test.cpp b/src/test.cpp index 32065f9..d8ec5d2 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -698,10 +698,62 @@ void generic_reader_test() std::cout << "Elapsed Time: " << timed_call.template elapsed_time() << "ms" << std::endl; } +#define NUM_MARKERS 28 +#define NUM_ALLELES 34 + +template +class subset_test +{ +public: + void operator()(const std::string& path) + { + R rdr(path, F); + std::vector subset = {"NA00003","NA00005", "NA50000"}; + auto intersect = rdr.subset_samples(subset); + assert(intersect.size() == 2); + + savvy::site_info i; + std::vector d; + std::size_t cnt{}; + std::size_t sum{}; + while (rdr.read(i, d)) + { + assert(d.size() == 2); + sum += std::count_if(d.begin(), d.end(), [](float f) { return f > 0.0f; }); + ++cnt; + } + assert(cnt == NUM_MARKERS); + assert(sum == NUM_ALLELES); + } +}; + //#include int main(int argc, char** argv) { + assert(argc > 1); + + std::string cmd(argv[1]); + if (cmd == "subset") + { + subset_test, savvy::fmt::genotype>()("test_file.vcf"); + subset_test, savvy::fmt::genotype>()("test_file.sav"); + subset_test, savvy::fmt::genotype>()("test_file.vcf"); + subset_test, savvy::fmt::genotype>()("test_file.sav"); + } + + return EXIT_SUCCESS; + + + + + + + + + + + std::cout << "[0] Run all tests." << std::endl; std::cout << "[1] Run varint test." << std::endl; std::cout << "[2] Run file conversion test." << std::endl; From c74c8acb79468e6e660635b1943714338fbd7ef4 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 17 Nov 2017 11:53:37 -0500 Subject: [PATCH 20/86] Begins to improve tests. --- .travis.yml | 28 +++++ CMakeLists.txt | 20 +++- src/test.cpp | 269 ++++++++++++++++++------------------------------- test_file.vcf | 11 +- 4 files changed, 151 insertions(+), 177 deletions(-) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..6b653a0 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,28 @@ +language: cpp +compiler: gcc +dist: trusty +sudo: required +group: edge +branches: + only: + - develop +addons: + apt: + sources: + - george-edison55-precise-backports + - ubuntu-toolchain-r-test + packages: + - cmake-data + - cmake + - python-dev + - python-pip +install: + - sudo pip install cget + - cget install -f ./requirements.txt +script: + - cmake --version + - mkdir build && cd build + - cmake -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake .. + - cd .. + - make -C build + - make CTEST_OUTPUT_ON_FAILURE=1 test -C buld \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f2fdbf..ddbaf8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,6 @@ add_library(savvy ${SOURCE_FILES} ${HEADER_FILES}) target_link_libraries(savvy ${HTS_LIBRARY} ${ZLIB_LIBRARY} ${ZSTD_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(savvy PUBLIC $ $) -add_executable(savvy-test src/test.cpp src/test_class.cpp include/savvy/test_class.hpp) -target_link_libraries(savvy-test savvy) - add_executable(bcf2m3vcf src/bcf2m3vcf.cpp) target_link_libraries(bcf2m3vcf savvy) @@ -51,6 +48,23 @@ target_link_libraries(sav-sample-sort savvy) add_executable(savvy-speed-test src/savvy_speed_test.cpp) target_link_libraries(savvy-speed-test savvy) + +if(BUILD_TESTS) + enable_testing() + + add_definitions(-DSAVVYT_VCF_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.vcf\" + -DSAVVYT_SAV_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.sav\" + -DSAVVYT_MARKER_COUNT=28 + -DSAVVYT_ALLELE_COUNT=34) + + add_executable(savvy-test src/test.cpp src/test_class.cpp include/savvy/test_class.hpp) + target_link_libraries(savvy-test savvy) + + add_test(convert_file_test savvy-test convert-file) + add_test(subset_test savvy-test subset) + add_test(varint_test savvy-test varint) +endif() + if(BUILD_SLR_EXAMPLES) add_executable(slr-examples src/slr_examples.cpp) target_link_libraries(slr-examples savvy armadillo) diff --git a/src/test.cpp b/src/test.cpp index d8ec5d2..b83cd3b 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -164,6 +165,12 @@ // return 0; //} +bool file_exists(const std::string& file_path) +{ + struct stat st; + return (stat(file_path.c_str(), &st) == 0); +} + int varint_test() { std::vector arr(0xFFFFFF); @@ -541,56 +548,6 @@ void convert_file_test() run_file_checksum_test(); } -//template -//double inner_product(const M& mrkr, std::vector& vec, const double start_val = 0.0) -//{ -// double ret = start_val; -// -// std::size_t i = 0; -// for (const savvy::allele_status& gt : mrkr) -// { -// if (gt == savvy::allele_status::has_alt) -// ret += 1.0 * vec[i]; -// ++i; -// } -// -// return ret; -//} -// -//template <> -//double inner_product(const savvy::sav::marker& mrkr, std::vector& vec, const double start_val) -//{ -// double ret = start_val; -// -// for (auto it = mrkr.non_ref_begin(); it != mrkr.non_ref_end(); ++it) -// { -// if (it->status == savvy::allele_status::has_alt) -// ret += 1.0 * vec[it->offset]; -// else -// ret += 0.04 * vec[it->offset]; -// } -// -// return ret; -//} - -//class file_handler_functor -//{ -//public: -// template -// void operator()(T&& input_file_reader) -// { -// typedef typename T::input_iterator input_iterator_type; -// typename T::input_iterator::buffer buf{}; -// -// for (auto it = input_iterator_type(input_file_reader, buf); it != input_iterator_type(); ++it) -// { -// inner_product(*it, phenotypes_); -// } -// }; -//private: -// std::vector phenotypes_; -//}; - class triple_file_handler_functor { public: @@ -668,21 +625,59 @@ void random_access_test() savvy::indexed_reader<1> rdr("test_file.sav", {"20", 17000, 1120000}, savvy::fmt::allele); savvy::site_info anno; - std::vector b; + std::vector buf; - while (rdr.read(anno, b)) - { - std::cout << anno.chromosome() << " " << anno.position() << " " << anno.ref() << " " << anno.alt() << std::endl; - } + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "20"); + assert(anno.position() == 17330); + assert(anno.ref() == "T"); + assert(anno.alt() == "A"); + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "20"); + assert(anno.position() == 1110696); + assert(anno.ref() == "A"); + assert(anno.alt() == "G"); + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "20"); + assert(anno.position() == 1110696); + assert(anno.ref() == "A"); + assert(anno.alt() == "T"); - std::cout << "--------------------------------" << std::endl; + assert(rdr.good()); + + assert(!rdr.read(anno, buf)); rdr.reset_region({"18", 2234600, 2234700}); - std::vector v; - while (rdr.read(anno, v)) - { - std::cout << anno.chromosome() << " " << anno.position() << " " << anno.ref() << " " << anno.alt() << std::endl; - } + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "18"); + assert(anno.position() == 2234668); + assert(anno.ref() == "G"); + assert(anno.alt() == "A"); + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "18"); + assert(anno.position() == 2234679); + assert(anno.ref() == "G"); + assert(anno.alt() == "T"); + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "18"); + assert(anno.position() == 2234687); + assert(anno.ref() == "G"); + assert(anno.alt() == "A"); + + assert(rdr.read(anno, buf)); + assert(anno.chromosome() == "18"); + assert(anno.position() == 2234697); + assert(anno.ref() == "T"); + assert(anno.alt() == "A"); + + assert(rdr.good()); + + assert(!rdr.read(anno, buf)); } void generic_reader_test() @@ -698,9 +693,6 @@ void generic_reader_test() std::cout << "Elapsed Time: " << timed_call.template elapsed_time() << "ms" << std::endl; } -#define NUM_MARKERS 28 -#define NUM_ALLELES 34 - template class subset_test { @@ -708,7 +700,9 @@ class subset_test void operator()(const std::string& path) { R rdr(path, F); - std::vector subset = {"NA00003","NA00005", "NA50000"}; + assert(rdr.good()); + + std::vector subset = {"NA00003","NA00005", "FAKE_ID"}; auto intersect = rdr.subset_samples(subset); assert(intersect.size() == 2); @@ -722,126 +716,63 @@ class subset_test sum += std::count_if(d.begin(), d.end(), [](float f) { return f > 0.0f; }); ++cnt; } - assert(cnt == NUM_MARKERS); - assert(sum == NUM_ALLELES); + assert(cnt == SAVVYT_MARKER_COUNT); + assert(sum == SAVVYT_ALLELE_COUNT); } }; -//#include - int main(int argc, char** argv) { - assert(argc > 1); + std::string cmd = (argc < 2) ? "" : argv[1]; - std::string cmd(argv[1]); - if (cmd == "subset") + if (cmd.empty()) { - subset_test, savvy::fmt::genotype>()("test_file.vcf"); - subset_test, savvy::fmt::genotype>()("test_file.sav"); - subset_test, savvy::fmt::genotype>()("test_file.vcf"); - subset_test, savvy::fmt::genotype>()("test_file.sav"); + std::cout << "Enter Command:" << std::endl; + std::cout << "- convert-file" << std::endl; + std::cout << "- subset" << std::endl; + std::cout << "- varint" << std::endl; + std::cout << "- generic-reader" << std::endl; + std::cout << "- random-access" << std::endl; + std::cin >> cmd; } - return EXIT_SUCCESS; - - - - - - - - - - - std::cout << "[0] Run all tests." << std::endl; - std::cout << "[1] Run varint test." << std::endl; - std::cout << "[2] Run file conversion test." << std::endl; - std::cout << "[3] Run generic reader test." << std::endl; - std::cout << "[4] Run random access test." << std::endl; - - char t = '0'; - std::cin >> t; - - switch (t) + if (cmd == "convert-file") { - case '0': - varint_test(); - convert_file_test(); - break; - case '1': - varint_test(); - break; - case '2': - convert_file_test(); - break; - case '3': - generic_reader_test(); - break; - case '4': - random_access_test(); - break; - default: - std::cout << "Invalid Input" << std::endl; + convert_file_test(); } + else if (cmd == "subset") + { + if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); + subset_test, savvy::fmt::genotype>()(SAVVYT_VCF_FILE); + subset_test, savvy::fmt::genotype>()(SAVVYT_SAV_FILE); + subset_test, savvy::fmt::genotype>()(SAVVYT_VCF_FILE); + subset_test, savvy::fmt::genotype>()(SAVVYT_SAV_FILE); + } + else if (cmd == "varint") + { + varint_test(); + } + else if (cmd == "random-access") + { + if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); + random_access_test(); + } + else if (cmd == "generic-reader") + { + if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); -// savvy::open_marker_files(triple_file_handler_functor(), "chr1.bcf", "chr1.cmf", "chr1.m3vcf"); -// -// savvy::open_marker_files(std::make_tuple("chr1.cmf", "chr1.m3vcf"), [](auto&& input_file_reader1, auto&& input_file_reader2) -// { -// typedef typename std::remove_reference::type R1; -// typename R1::input_iterator::buffer buf{}; -// typename R1::input_iterator eof{}; -// typename R1::input_iterator it(input_file_reader1, buf); -// -// typedef typename std::remove_reference::type R2; -// typename R2::input_iterator::buffer buf2{}; -// typename R2::input_iterator eof2{}; -// typename R2::input_iterator it2(input_file_reader2, buf2); -// -// while (it != eof) -// { -// -// ++it; -// } -// -// while (it2 != eof2) -// { -// -// ++it2; -// } -// -// }); -// -// savvy::open_marker_file("chr1.bcf", [](auto&& input_file_reader) -// { -// typedef typename std::remove_reference::type R; -// typename R::input_iterator::buffer buf{}; -// typename R::input_iterator eof{}; -// typename R::input_iterator it(input_file_reader, buf); -// -// while (it != eof) -// { -// it->pos(); -// it->ref() + ":" + it->alt(); -// for (const savvy::allele_status& gt : *it) -// { -// -// } -// ++it; -// } -// }); -// -// savvy::open_marker_file("chr1.bcf", file_handler_functor()); -// file_handler_functor f; -// savvy::open_marker_file("chr1.bcf", f); -// -// savvy::iterate_marker_file("chr1.bcf", marker_handler_functor()); - + generic_reader_test(); + } + else + { + std::cerr << "Invalid Command" << std::endl; + return EXIT_FAILURE; + } - return 0; + return EXIT_SUCCESS; } diff --git a/test_file.vcf b/test_file.vcf index 8a6c33a..881a16a 100644 --- a/test_file.vcf +++ b/test_file.vcf @@ -18,12 +18,13 @@ ##FORMAT= ##FORMAT= ##FORMAT= +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 -18 2234668 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 0|0:48:8:51,51 0/1:43:5:.,. 0/1:35:4 0/0:17:2 1/0:40:3 -18 2234679 . G T 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 0|0:48:8:51,51 1/0:43:5:.,. 0/1:35:4 0/1:17:2 0/1:40:3 -18 2234687 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 0|0:48:8:51,51 0/0:43:5:.,. 1/0:35:4 0/0:17:2 0/0:40:3 -18 2234697 . T A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 0|0:48:8:51,51 1/0:43:5:.,. 0/0:35:4 0/0:17:2 0/0:40:3 -18 2234767 . T G 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 0/1:43:5:.,. 0/1:35:4 0/0:17:2 1/1:40:3 +18 2234668 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 0/1:0.0,9.0:43:5:.,. 0/1:0.0,1.0:35:4 0/0:0.0,0.0:17:2 1/0:1.0,0.0:40:3 +18 2234679 . G T 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 1/0:1.0,0.0:43:5:.,. 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 0/1:0.0,1.0:40:3 +18 2234687 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 0/0:0.0,0.0:43:5:.,. 1/0:1.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 +18 2234697 . T A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 1/0:1.0,0.0:43:5:.,. 0/0:0.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 +18 2234767 . T G 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 0/1:0.0,1.0:43:5:.,. 0/1:0.0,1.0:35:4 0/0:0.0,0.0:17:2 1/1:1.0,0.9:40:3 20 14360 . G AC 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:GP 0|0:48:1:0.999,0.001,0 0|0:48:8:1,0,0 1/1:43:5:0,0.2,0.8 0/1:35:4:0.05,0.95,0 1/1:17:2:0,0,1 0/0:40:3:0.8,0.05,0.15 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 ./1:43:5:.,. 0/1:35:4 1/1:17:2 1/1:40:3 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 0/1:35:4 0/1:17:2 1/1:40:3 From 72f4f50e6426853cf8b33a8fb2649c7b6885ca0c Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 17 Nov 2017 12:00:00 -0500 Subject: [PATCH 21/86] Updates travis config. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6b653a0..e1065a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ install: script: - cmake --version - mkdir build && cd build - - cmake -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake .. + - cmake -DBUILD_TESTS=1 -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake .. - cd .. - make -C build - make CTEST_OUTPUT_ON_FAILURE=1 test -C buld \ No newline at end of file From b4f9d06b0fc64289c14ce5c0ab2276c3fb0e882a Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 17 Nov 2017 12:08:51 -0500 Subject: [PATCH 22/86] Updates tests. --- .travis.yml | 5 ++--- src/test.cpp | 22 ++++------------------ 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/.travis.yml b/.travis.yml index e1065a7..8ec0d89 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,5 @@ script: - cmake --version - mkdir build && cd build - cmake -DBUILD_TESTS=1 -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake .. - - cd .. - - make -C build - - make CTEST_OUTPUT_ON_FAILURE=1 test -C buld \ No newline at end of file + - make + - make CTEST_OUTPUT_ON_FAILURE=1 test \ No newline at end of file diff --git a/src/test.cpp b/src/test.cpp index b83cd3b..54876f3 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -502,8 +502,8 @@ file_checksum_test make_file_checksum_test(T1& a, T2& b) void run_file_checksum_test() { - savvy::reader<1> input_file_reader1("test_file.vcf", savvy::fmt::allele); - savvy::reader<1> input_file_reader2("test_file.sav", savvy::fmt::allele); + savvy::reader<1> input_file_reader1(SAVVYT_VCF_FILE, savvy::fmt::allele); + savvy::reader<1> input_file_reader2(SAVVYT_SAV_FILE, savvy::fmt::allele); auto t = make_file_checksum_test(input_file_reader1, input_file_reader2); std::cout << "Starting checksum test ..." << std::endl; auto timed_call = time_procedure(t); @@ -514,7 +514,7 @@ void run_file_checksum_test() void convert_file_test() { { - savvy::vcf::reader<1> input("test_file.vcf", savvy::fmt::allele); + savvy::vcf::reader<1> input(SAVVYT_VCF_FILE, savvy::fmt::allele); savvy::site_info anno; std::vector data; @@ -523,24 +523,10 @@ void convert_file_test() file_info.insert(file_info.begin(), {"INFO",""}); file_info.insert(file_info.begin(), {"INFO",""}); file_info.insert(file_info.begin(), {"INFO",""}); - savvy::sav::writer compact_output("test_file.sav", input.samples_begin(), input.samples_end(), file_info.begin(), file_info.end()); + savvy::sav::writer compact_output(SAVVYT_SAV_FILE, input.samples_begin(), input.samples_end(), file_info.begin(), file_info.end()); while (input.read(anno, data)) { - // savvy::allele_vector> m(std::string(chrom), cur->locus(), std::string(cur->ref()), std::string(cur->alt()), sample_ids.size(), ploidy, std::vector()); - // for (auto it = cur->begin(); it != cur->end(); ++it) - // { - // switch (*it) - // { - // case savvy::allele_status::has_alt: - // m[std::distance(cur->begin(), it)] = 1.0; - // break; - // case savvy::allele_status::is_missing: - // m[std::distance(cur->begin(), it)] = std::numeric_limits::quiet_NaN(); - // break; - // } - // } - compact_output.write(anno, data); } } From 96455dec0ff3e844216cba0bbcfb6108c44bfac7 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 17 Nov 2017 12:28:01 -0500 Subject: [PATCH 23/86] Removes vector overload to subset_samples methods. --- README.md | 14 ++++++++++++++ include/savvy/reader.hpp | 11 ----------- include/savvy/sav_reader.hpp | 8 -------- include/savvy/vcf_reader.hpp | 8 -------- src/test.cpp | 2 +- 5 files changed, 15 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 5b8c46f..25d3b70 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,20 @@ while (f.read(anno, alleles)) } ``` +## Subsetting Samples +```c++ +savvy::reader<1> f("chr1.sav", savvy::fmt::allele); +std::vector requested = {"ID001","ID002","ID003"}; +std::vector intersect = f.subset_samples({requested.begin(), requested.end()}); + +savvy::site_info anno; +std::vector alleles; +while (f.read(anno, alleles)) +{ + ... +} +``` + ## Multiple Data Vectors ```c++ savvy::reader<2> f("chr1.bcf", savvy::fmt::genotype, savvy::fmt::dosage); diff --git a/include/savvy/reader.hpp b/include/savvy/reader.hpp index cb995b2..f2f008b 100644 --- a/include/savvy/reader.hpp +++ b/include/savvy/reader.hpp @@ -148,7 +148,6 @@ namespace savvy std::size_t sample_size() const; std::vector subset_samples(const std::set& subset); - std::vector subset_samples(const std::vector& subset); protected: virtual savvy::sav::reader_base* sav_impl() const = 0; virtual savvy::vcf::reader_base* vcf_impl() const = 0; @@ -284,16 +283,6 @@ namespace savvy return vcf_impl()->subset_samples(subset); return std::vector(); } - - template - std::vector reader_base::subset_samples(const std::vector& subset) - { - if (sav_impl()) - return sav_impl()->subset_samples(subset); - else if (vcf_impl()) - return vcf_impl()->subset_samples(subset); - return std::vector(); - } //################################################################// //################################################################// diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 6bfea7f..9dab4e5 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -99,7 +99,6 @@ namespace savvy * @return intersect of subset and samples IDs in file. */ std::vector subset_samples(const std::set& subset); - std::vector subset_samples(const std::vector& subset); const std::string& file_path() const { return file_path_; } std::streampos tellg() { return this->input_stream_.tellg(); } @@ -1357,13 +1356,6 @@ namespace savvy return ret; } - template - std::vector reader_base::subset_samples(const std::vector& subset) - { - std::set unique(subset.begin(), subset.end()); - return subset_samples(unique); - } - template <> template inline std::tuple detail::allele_decoder<0>::decode(std::istreambuf_iterator& in_it, const std::istreambuf_iterator& end_it, const T& missing_value) diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 18ddd90..b9dff22 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -85,7 +85,6 @@ namespace savvy const char** samples_end() const; std::vector subset_samples(const std::set& subset); - std::vector subset_samples(const std::vector& subset); std::vector prop_fields() const; std::vector> headers() const; @@ -356,13 +355,6 @@ namespace savvy return ret; } - template - std::vector reader_base::subset_samples(const std::vector& subset) - { - std::set unique(subset.begin(), subset.end()); - return subset_samples(unique); - } - template std::vector reader_base::prop_fields() const { diff --git a/src/test.cpp b/src/test.cpp index 54876f3..d20ec6b 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -689,7 +689,7 @@ class subset_test assert(rdr.good()); std::vector subset = {"NA00003","NA00005", "FAKE_ID"}; - auto intersect = rdr.subset_samples(subset); + auto intersect = rdr.subset_samples({subset.begin(), subset.end()}); assert(intersect.size() == 2); savvy::site_info i; From b99d59a98ce7ec2ab55dcc2789e8b0cf975d0304 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Fri, 17 Nov 2017 17:27:53 -0500 Subject: [PATCH 24/86] Adds automatic conversion to requested format for SAV files. --- CMakeLists.txt | 5 +- include/savvy/data_format.hpp | 14 +++ include/savvy/sav_reader.hpp | 171 ++++++++++++++++++++++------------ src/test.cpp | 167 +++++++++++++++++++-------------- test_file.vcf | 30 +++--- 5 files changed, 240 insertions(+), 147 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddbaf8c..7ec8426 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,9 +53,10 @@ if(BUILD_TESTS) enable_testing() add_definitions(-DSAVVYT_VCF_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.vcf\" - -DSAVVYT_SAV_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.sav\" + -DSAVVYT_SAV_FILE_HARD=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_hard.sav\" + -DSAVVYT_SAV_FILE_DOSE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_dose.sav\" -DSAVVYT_MARKER_COUNT=28 - -DSAVVYT_ALLELE_COUNT=34) + -DSAVVYT_ALLELE_COUNT=40) add_executable(savvy-test src/test.cpp src/test_class.cpp include/savvy/test_class.hpp) target_link_libraries(savvy-test savvy) diff --git a/include/savvy/data_format.hpp b/include/savvy/data_format.hpp index 728ee71..5141a7c 100644 --- a/include/savvy/data_format.hpp +++ b/include/savvy/data_format.hpp @@ -23,5 +23,19 @@ namespace savvy // ds = dosage // ec = dosage }; + + inline std::uint64_t sample_stride(fmt format, std::uint64_t ploidy) + { + switch (format) + { + case fmt::allele: return ploidy; + case fmt::genotype: return 1; + case fmt::genotype_probability: return ploidy + 1; + case fmt::genotype_likelihood: return ploidy + 1; + case fmt::phred_scaled_genotype_likelihood: return ploidy + 1; + case fmt::dosage: return 1; + case fmt::haplotype_dosage: return ploidy; + } + } } #endif //LIBSAVVY_DATA_FORMAT_HPP diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 9dab4e5..64dd26d 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -249,15 +249,13 @@ namespace savvy this->discard_genotypes_impl<7>(); } - template + template void read_genotypes_al(T& destination) { - if (file_data_format_ != fmt::allele) - input_stream_.setstate(std::ios::badbit); - if (good()) { const typename T::value_type alt_value = typename T::value_type(1); + const auto missing_value = std::numeric_limits::quiet_NaN(); std::istreambuf_iterator in_it(input_stream_); std::istreambuf_iterator end_it; @@ -276,29 +274,60 @@ namespace savvy { destination.resize(subset_size_ * ploidy_level); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + if (BitWidth == 1) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - - const std::uint64_t sample_index = total_offset / ploidy_level; - if (subset_map_[sample_index] != std::numeric_limits::max()) - destination[subset_map_[sample_index] + (total_offset % ploidy_level)] = allele; //(allele ? missing_value : alt_value); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (total_offset % ploidy_level)] = allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] + (total_offset % ploidy_level)] = std::round(allele); //(allele ? missing_value : alt_value); + } } } else { destination.resize(sample_count() * ploidy_level); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + if (BitWidth == 1) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset] = allele; //(allele ? missing_value : alt_value); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset] = allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset] = std::round(allele); //(allele ? missing_value : alt_value); + } } } @@ -307,15 +336,13 @@ namespace savvy } } - template + template void read_genotypes_gt(T& destination) { - if (file_data_format_ != fmt::allele) - input_stream_.setstate(std::ios::failbit); - if (good()) { const typename T::value_type alt_value{1}; + const auto missing_value = std::numeric_limits::quiet_NaN(); std::istreambuf_iterator in_it(input_stream_); std::istreambuf_iterator end_it; @@ -334,29 +361,60 @@ namespace savvy { destination.resize(subset_size_); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + if (BitWidth == 1) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - - const std::uint64_t sample_index = total_offset / ploidy_level; - if (subset_map_[sample_index] != std::numeric_limits::max()) - destination[subset_map_[sample_index]] += allele; //(allele ? missing_value : alt_value); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += std::round(allele); //(allele ? missing_value : alt_value); + } } } else { destination.resize(sample_count()); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + if (BitWidth == 1) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset / ploidy_level] += std::round(allele); //(allele ? missing_value : alt_value); + } } } @@ -418,12 +476,9 @@ namespace savvy // } // } - template + template void read_genotypes_hds(T& destination) { - if (file_data_format_ != fmt::haplotype_dosage) - input_stream_.setstate(std::ios::failbit); - if (good()) { const typename T::value_type alt_value = typename T::value_type(1); @@ -450,7 +505,7 @@ namespace savvy { typename T::value_type allele; std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); total_offset += offset; @@ -467,7 +522,7 @@ namespace savvy { typename T::value_type allele; std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); total_offset += offset; @@ -481,15 +536,13 @@ namespace savvy } } - template + template void read_genotypes_ds(T& destination) { - if (file_data_format_ != fmt::haplotype_dosage) - input_stream_.setstate(std::ios::failbit); - if (good()) { - const typename T::value_type alt_value{1}; + const typename T::value_type alt_value(1); + const typename T::value_type missing_value(std::numeric_limits::quiet_NaN()); std::istreambuf_iterator in_it(input_stream_); std::istreambuf_iterator end_it; @@ -512,7 +565,7 @@ namespace savvy { typename T::value_type allele; std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); total_offset += offset; const std::uint64_t sample_index = total_offset / ploidy_level; @@ -528,7 +581,7 @@ namespace savvy { typename T::value_type allele; std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); total_offset += offset; destination[total_offset / ploidy_level] += allele; } @@ -573,16 +626,16 @@ namespace savvy { if (true) //requested_data_formats_[idx] == file_data_format_) { - if (requested_data_formats_[idx] == fmt::allele && file_data_format_ == fmt::allele) - read_genotypes_al(destination); - else if (requested_data_formats_[idx] == fmt::genotype && file_data_format_ == fmt::allele) - read_genotypes_gt(destination); + if (requested_data_formats_[idx] == fmt::allele) + file_data_format_ == fmt::allele ? read_genotypes_al<1>(destination) : read_genotypes_al<7>(destination); + else if (requested_data_formats_[idx] == fmt::genotype) + file_data_format_ == fmt::allele ? read_genotypes_gt<1>(destination) : read_genotypes_gt<7>(destination); // else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) // read_genotypes_gp(destination); - else if (requested_data_formats_[idx] == fmt::dosage && file_data_format_ == fmt::haplotype_dosage) - read_genotypes_ds(destination); - else if (requested_data_formats_[idx] == fmt::haplotype_dosage && file_data_format_ == fmt::haplotype_dosage) - read_genotypes_hds(destination); + else if (requested_data_formats_[idx] == fmt::dosage) + file_data_format_ == fmt::allele ? read_genotypes_ds<1>(destination) : read_genotypes_ds<7>(destination); + else if (requested_data_formats_[idx] == fmt::haplotype_dosage) + file_data_format_ == fmt::allele ? read_genotypes_hds<1>(destination) : read_genotypes_hds<7>(destination); else input_stream_.setstate(std::ios::failbit); } diff --git a/src/test.cpp b/src/test.cpp index d20ec6b..eb8abac 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -8,6 +8,7 @@ #include "savvy/variant_iterator.hpp" #include "savvy/reader.hpp" #include "savvy/site_info.hpp" +#include "savvy/data_format.hpp" #include #include @@ -503,7 +504,7 @@ file_checksum_test make_file_checksum_test(T1& a, T2& b) void run_file_checksum_test() { savvy::reader<1> input_file_reader1(SAVVYT_VCF_FILE, savvy::fmt::allele); - savvy::reader<1> input_file_reader2(SAVVYT_SAV_FILE, savvy::fmt::allele); + savvy::reader<1> input_file_reader2(SAVVYT_SAV_FILE_HARD, savvy::fmt::allele); auto t = make_file_checksum_test(input_file_reader1, input_file_reader2); std::cout << "Starting checksum test ..." << std::endl; auto timed_call = time_procedure(t); @@ -511,28 +512,42 @@ void run_file_checksum_test() std::cout << "Elapsed Time: " << timed_call.template elapsed_time() << "ms" << std::endl; } -void convert_file_test() +template +class convert_file_test { +public: + void operator()() { - savvy::vcf::reader<1> input(SAVVYT_VCF_FILE, savvy::fmt::allele); - savvy::site_info anno; - std::vector data; - - auto file_info = input.headers(); - file_info.reserve(file_info.size() + 3); - file_info.insert(file_info.begin(), {"INFO",""}); - file_info.insert(file_info.begin(), {"INFO",""}); - file_info.insert(file_info.begin(), {"INFO",""}); - savvy::sav::writer compact_output(SAVVYT_SAV_FILE, input.samples_begin(), input.samples_end(), file_info.begin(), file_info.end()); - - while (input.read(anno, data)) { - compact_output.write(anno, data); + savvy::vcf::reader<1> input(SAVVYT_VCF_FILE, Fmt); + savvy::site_info anno; + std::vector data; + + auto file_info = input.headers(); + file_info.reserve(file_info.size() + 3); + file_info.insert(file_info.begin(), {"INFO", ""}); + file_info.insert(file_info.begin(), {"INFO", ""}); + file_info.insert(file_info.begin(), {"INFO", ""}); + savvy::sav::writer::options opts{}; + opts.data_format = Fmt; + savvy::sav::writer output(Fmt == savvy::fmt::haplotype_dosage ? SAVVYT_SAV_FILE_DOSE : SAVVYT_SAV_FILE_HARD, input.samples_begin(), input.samples_end(), file_info.begin(), file_info.end(), opts); + + std::size_t cnt = 0; + while (input.read(anno, data)) + { + output.write(anno, data); + ++cnt; + } + + assert(output.good()); + assert(cnt == SAVVYT_MARKER_COUNT); } - } - run_file_checksum_test(); -} + if (Fmt != savvy::fmt::haplotype_dosage) + run_file_checksum_test(); + savvy::sav::writer::create_index(Fmt == savvy::fmt::haplotype_dosage ? SAVVYT_SAV_FILE_DOSE : SAVVYT_SAV_FILE_HARD); + } +}; class triple_file_handler_functor { @@ -605,11 +620,10 @@ class marker_handler_functor // std::size_t file2_cnt_ = 0; //}; -void random_access_test() -{ - savvy::sav::writer::create_index("test_file.sav"); - savvy::indexed_reader<1> rdr("test_file.sav", {"20", 17000, 1120000}, savvy::fmt::allele); +void sav_random_access_test(savvy::fmt format) +{ + savvy::sav::indexed_reader<1> rdr(format == savvy::fmt::allele ? SAVVYT_SAV_FILE_HARD : SAVVYT_SAV_FILE_DOSE, {"20", 17000, 1120000}, format); savvy::site_info anno; std::vector buf; @@ -666,46 +680,49 @@ void random_access_test() assert(!rdr.read(anno, buf)); } -void generic_reader_test() +void generic_reader_test(const std::string& path, savvy::fmt format) { - savvy::reader<1> rdr1("test_file.sav", savvy::fmt::allele); - savvy::reader<1> rdr2("test_file.vcf", savvy::fmt::allele); + savvy::reader<1> rdr(path, format); + assert(rdr.good()); + std::vector subset = {"NA00003","NA00005", "FAKE_ID"}; + auto intersect = rdr.subset_samples({subset.begin(), subset.end()}); + assert(intersect.size() == 2); - auto t = make_file_checksum_test(rdr1, rdr2); - std::cout << "Starting checksum test ..." << std::endl; - auto timed_call = time_procedure(t); - std::cout << "Returned: " << (timed_call.return_value() ? "True" : "FALSE") << std::endl; - std::cout << "Elapsed Time: " << timed_call.template elapsed_time() << "ms" << std::endl; + savvy::site_info i; + std::vector d; + std::size_t cnt{}; + std::size_t sum{}; + while (rdr.read(i, d)) + { + assert(d.size() == intersect.size() * savvy::sample_stride(format, 2)); + sum += std::count_if(d.begin(), d.end(), [](float f) { return f > 0.0f; }); + ++cnt; + } + assert(cnt == SAVVYT_MARKER_COUNT); + assert(sum == SAVVYT_ALLELE_COUNT); } template -class subset_test +void subset_test(const std::string& path) { -public: - void operator()(const std::string& path) + R rdr(path, F); + assert(rdr.good()); + + std::vector subset = {"NA00003","NA00005", "FAKE_ID"}; + auto intersect = rdr.subset_samples({subset.begin(), subset.end()}); + assert(intersect.size() == 2); + + savvy::site_info i; + std::vector d; + std::size_t cnt{}; + while (rdr.read(i, d)) { - R rdr(path, F); - assert(rdr.good()); - - std::vector subset = {"NA00003","NA00005", "FAKE_ID"}; - auto intersect = rdr.subset_samples({subset.begin(), subset.end()}); - assert(intersect.size() == 2); - - savvy::site_info i; - std::vector d; - std::size_t cnt{}; - std::size_t sum{}; - while (rdr.read(i, d)) - { - assert(d.size() == 2); - sum += std::count_if(d.begin(), d.end(), [](float f) { return f > 0.0f; }); - ++cnt; - } - assert(cnt == SAVVYT_MARKER_COUNT); - assert(sum == SAVVYT_ALLELE_COUNT); + assert(d.size() == intersect.size() * savvy::sample_stride(F, 2)); + ++cnt; } -}; + assert(cnt == SAVVYT_MARKER_COUNT); +} int main(int argc, char** argv) { @@ -715,42 +732,50 @@ int main(int argc, char** argv) { std::cout << "Enter Command:" << std::endl; std::cout << "- convert-file" << std::endl; - std::cout << "- subset" << std::endl; - std::cout << "- varint" << std::endl; std::cout << "- generic-reader" << std::endl; std::cout << "- random-access" << std::endl; + std::cout << "- subset" << std::endl; + std::cout << "- varint" << std::endl; std::cin >> cmd; } if (cmd == "convert-file") { - convert_file_test(); + convert_file_test()(); + convert_file_test()(); } - else if (cmd == "subset") + else if (cmd == "generic-reader") { - if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); + if (!file_exists(SAVVYT_SAV_FILE_HARD)) convert_file_test()(); + if (!file_exists(SAVVYT_SAV_FILE_DOSE)) convert_file_test()(); - subset_test, savvy::fmt::genotype>()(SAVVYT_VCF_FILE); - subset_test, savvy::fmt::genotype>()(SAVVYT_SAV_FILE); - subset_test, savvy::fmt::genotype>()(SAVVYT_VCF_FILE); - subset_test, savvy::fmt::genotype>()(SAVVYT_SAV_FILE); - } - else if (cmd == "varint") - { - varint_test(); + for (auto path : {SAVVYT_SAV_FILE_HARD}) //, SAVVYT_SAV_FILE_DOSE}) + { + for (savvy::fmt format : {savvy::fmt::allele, savvy::fmt::haplotype_dosage}) + generic_reader_test(path, format); + } } else if (cmd == "random-access") { - if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); + if (!file_exists(SAVVYT_SAV_FILE_HARD)) convert_file_test()(); + if (!file_exists(SAVVYT_SAV_FILE_DOSE)) convert_file_test()(); - random_access_test(); + sav_random_access_test(savvy::fmt::allele); + sav_random_access_test(savvy::fmt::haplotype_dosage); } - else if (cmd == "generic-reader") + else if (cmd == "subset") { - if (!file_exists(SAVVYT_SAV_FILE)) convert_file_test(); + if (!file_exists(SAVVYT_SAV_FILE_HARD)) convert_file_test()(); - generic_reader_test(); + subset_test, savvy::fmt::genotype>(SAVVYT_VCF_FILE); + subset_test, savvy::fmt::genotype>(SAVVYT_SAV_FILE_HARD); + subset_test, savvy::fmt::genotype>(SAVVYT_VCF_FILE); + subset_test, savvy::fmt::genotype>(SAVVYT_SAV_FILE_HARD); + } + else if (cmd == "varint") + { + varint_test(); } else { diff --git a/test_file.vcf b/test_file.vcf index 881a16a..bc12808 100644 --- a/test_file.vcf +++ b/test_file.vcf @@ -25,22 +25,22 @@ 18 2234687 . G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 0/0:0.0,0.0:43:5:.,. 1/0:1.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 18 2234697 . T A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 0|0:0.0,0.0:48:8:51,51 1/0:1.0,0.0:43:5:.,. 0/0:0.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 18 2234767 . T G 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 0/1:0.0,1.0:43:5:.,. 0/1:0.0,1.0:35:4 0/0:0.0,0.0:17:2 1/1:1.0,0.9:40:3 -20 14360 . G AC 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:GP 0|0:48:1:0.999,0.001,0 0|0:48:8:1,0,0 1/1:43:5:0,0.2,0.8 0/1:35:4:0.05,0.95,0 1/1:17:2:0,0,1 0/0:40:3:0.8,0.05,0.15 -20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 ./1:43:5:.,. 0/1:35:4 1/1:17:2 1/1:40:3 -20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 0/1:35:4 0/1:17:2 1/1:40:3 +20 14360 . G AC 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:GP 0|0:0.0,0.0:48:1:0.999,0.001,0 0|0:0.0,0.0:48:8:1,0,0 1/1:1.0,1.0:43:5:0,0.2,0.8 0/1:0.0,1.0:35:4:0.05,0.95,0 1/1:1.0,1.0:17:2:0,0,1 0/0:0.0,0.0:40:3:0.8,0.05,0.15 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 ./1:43:5:.,. 0/1:0.0,1.0:35:4 1/1:1.0,1.0:17:2 1/1:1.0,1.0:40:3 +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:49:3:58,50 0|1:0.0,1.0:3:5:65,3 0/0:0.0,0.0:41:3 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 0/1:35:4 0/1:17:2 1/1:40:3 -20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 0/1:35:4 0/1:17:2 1/1:40:3 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:54:7:56,60 0|0:0.0,0.0:48:4:51,51 0/0:0.0,0.0:61:2 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/0:40:3 2/1:35:4 1/2:17:2 1/1:40:3 -20 1234667 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 ./1:43:5:.,. 0/1:35:4 1/1:17:2 1/1:40:3 -20 1234767 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 0/1:35:4 0/1:17:2 0/0:40:3 -20 2230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 1|0:48:4:51,51 0/0:61:2 1/0:35:4 0/1:17:2 1/1:40:3 +20 1234667 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 ./1:43:5:.,. 0/1:0.0,1.0:35:4 1/1:1.0,1.0:17:2 1/1:1.0,1.0:40:3 +20 1234767 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:49:3:58,50 0|1:0.0,1.0:3:5:65,3 0/0:0.0,0.0:41:3 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 0/0:0.0,0.0:40:3 +20 2230237 . T . 47 PASS NS=3;DP=13;AA=T GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:54:7:56,60 1|0:1.0,0.0:48:4:51,51 0/0:0.0,0.0:61:2 1/0:1.0,0.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 20 2234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 2/1:35:4 1/2:17:2 1/0:40:3 -20 2234567 microsat1 GTC C 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/1:17:2 1/1:40:3 0/1:35:4 1/1:17:2 0/0:40:3 -20 2234667 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 ./1:43:5:.,. 0/1:35:4 0/1:17:2 1/1:40:3 -20 2234767 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 1|1:3:5:65,3 1/0:41:3 0/1:35:4 0/1:17:2 1/1:40:3 +20 2234567 microsat1 GTC C 50 PASS NS=3;DP=9;AA=G GT:HDS:GQ:DP 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 0/1:0.0,1.0:35:4 1/1:1.0,1.0:17:2 0/0:0.0,0.0:40:3 +20 2234667 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 ./1:43:5:.,. 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 +20 2234767 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:HDS:GQ:DP:HQ 0|0:0.0,0.0:49:3:58,50 1|1:1.0,1.0:3:5:65,3 1/0:1.0,0.0:41:3 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 1/1:1.0,1.0:40:3 20 3234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 2/1:35:4 1/2:17:2 1/0:40:3 -20 3234668 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 0/1:43:5:.,. 0/1:35:4 0/0:17:2 1/0:40:3 -20 3234679 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 1/0:43:5:.,. 0/1:35:4 0/1:17:2 0/1:40:3 -20 3234687 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 0/0:43:5:.,. 1/0:35:4 0/0:17:2 0/0:40:3 -20 3234697 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 1/0:43:5:.,. 0/0:35:4 0/0:17:2 0/0:40:3 -20 3234767 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 1|0:48:1:51,51 1|0:48:8:51,51 0/1:43:5:.,. 0/1:35:4 0/0:17:2 1/1:40:3 \ No newline at end of file +20 3234668 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 0/1:0.0,1.0:43:5:.,. 0/1:0.0,1.0:35:4 0/0:0.0,0.0:17:2 1/0:1.0,0.0:40:3 +20 3234679 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 1/0:1.0,0.0:43:5:.,. 0/1:0.0,1.0:35:4 0/1:0.0,1.0:17:2 0/1:0.0,1.0:40:3 +20 3234687 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 0/0:0.0,0.0:43:5:.,. 1/0:1.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 +20 3234697 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 1/0:1.0,0.0:43:5:.,. 0/0:0.0,0.0:35:4 0/0:0.0,0.0:17:2 0/0:0.0,0.0:40:3 +20 3234767 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:HDS:GQ:DP:HQ 1|0:1.0,0.0:48:1:51,51 1|0:1.0,0.0:48:8:51,51 0/1:0.0,1.0:43:5:.,. 0/1:0.0,1.0:35:4 0/0:0.0,0.0:17:2 1/1:1.0,1.0:40:3 \ No newline at end of file From 4912b8c56741dc97de1bbb70af6bc877cc0880b6 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Sun, 19 Nov 2017 18:26:03 -0500 Subject: [PATCH 25/86] Skips VCF records that don't have a requested format field. --- CMakeLists.txt | 4 +-- include/savvy/vcf_reader.hpp | 68 ++++++++++++++++++------------------ src/test.cpp | 24 ++++++------- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ec8426..2cf267c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,8 +55,8 @@ if(BUILD_TESTS) add_definitions(-DSAVVYT_VCF_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.vcf\" -DSAVVYT_SAV_FILE_HARD=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_hard.sav\" -DSAVVYT_SAV_FILE_DOSE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_dose.sav\" - -DSAVVYT_MARKER_COUNT=28 - -DSAVVYT_ALLELE_COUNT=40) + -DSAVVYT_MARKER_COUNT_HARD=28 + -DSAVVYT_MARKER_COUNT_DOSE=20) add_executable(savvy-test src/test.cpp src/test_class.cpp include/savvy/test_class.hpp) target_link_libraries(savvy-test savvy) diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index b9dff22..246963e 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -91,9 +91,6 @@ namespace savvy // std::vector::const_iterator prop_fields_begin() const { return property_fields_.begin(); } // std::vector::const_iterator prop_fields_end() const { return property_fields_.end(); } std::uint64_t sample_count() const; - - template - bool read_variant(site_info& annotations, T&... destinations); protected: virtual bcf_hdr_t* hts_hdr() const = 0; virtual bcf1_t* hts_rec() const = 0; @@ -108,11 +105,11 @@ namespace savvy void read_variant_details(site_info& destination); template - void read_requested_genos(T&... vec); + std::size_t read_requested_genos(T&... vec); template - void read_genos_to(fmt data_format, T1& vec); + bool read_genos_to(fmt data_format, T1& vec); template - void read_genos_to(fmt data_format, T1& vec, T2&... others); + bool read_genos_to(fmt data_format, T1& vec, T2&... others); template void read_genotypes_al(T& destination); @@ -419,19 +416,9 @@ namespace savvy template template - bool reader_base::read_variant(site_info& annotations, T&... destinations) - { - static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); - read_variant_details(annotations); - read_requested_genos(destinations...); - - return good(); - } - - template - template - void reader_base::read_requested_genos(T&... destinations) + std::size_t reader_base::read_requested_genos(T&... destinations) { + std::size_t cnt = 0; clear_destinations(destinations...); bcf_unpack(hts_rec(), BCF_UN_ALL); for (int i = 0; i < hts_rec()->n_fmt; ++i) @@ -447,44 +434,46 @@ namespace savvy { if (gt_idx < VecCnt) { - read_genos_to<0>(fmt::genotype, destinations...); + cnt += read_genos_to<0>(fmt::genotype, destinations...); } else // allele_idx < VecCnt { - read_genos_to<0>(fmt::allele, destinations...); + cnt += read_genos_to<0>(fmt::allele, destinations...); } } else if (fmt_key == "DS") { - read_genos_to<0>(fmt::dosage, destinations...); + cnt += read_genos_to<0>(fmt::dosage, destinations...); } else if (fmt_key == "HDS") { - read_genos_to<0>(fmt::haplotype_dosage, destinations...); + cnt += read_genos_to<0>(fmt::haplotype_dosage, destinations...); } else if (fmt_key == "GP") { - read_genos_to<0>(fmt::genotype_probability, destinations...); + cnt += read_genos_to<0>(fmt::genotype_probability, destinations...); } else if (fmt_key == "GL") { - read_genos_to<0>(fmt::genotype_likelihood, destinations...); + cnt += read_genos_to<0>(fmt::genotype_likelihood, destinations...); } else if (fmt_key == "PL") { - read_genos_to<0>(fmt::phred_scaled_genotype_likelihood, destinations...); + cnt += read_genos_to<0>(fmt::phred_scaled_genotype_likelihood, destinations...); } else { // Discard } } + return cnt; } template template - void reader_base::read_genos_to(fmt data_format, T1& destination) + bool reader_base::read_genos_to(fmt data_format, T1& destination) { + bool ret = true; if (requested_data_formats_[Idx] == data_format) { switch (data_format) @@ -515,20 +504,22 @@ namespace savvy else { // Discard Genotypes + ret = false; } + return ret; } template template - void reader_base::read_genos_to(fmt data_format, T1& vec, T2&... others) + bool reader_base::read_genos_to(fmt data_format, T1& vec, T2&... others) { if (requested_data_formats_[Idx] == data_format) { - read_genos_to(data_format, vec); + return read_genos_to(data_format, vec); } else { - read_genos_to(data_format, others...); + return read_genos_to(data_format, others...); } } @@ -1081,7 +1072,14 @@ namespace savvy template reader& reader::read(site_info& annotations, T&... destinations) { - this->read_variant(annotations, destinations...); + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); + std::size_t vecs_read = 0; + while (vecs_read == 0) + { + this->read_variant_details(annotations); + vecs_read = this->read_requested_genos(destinations...); + } + return *this; } //################################################################// @@ -1104,13 +1102,14 @@ namespace savvy template indexed_reader& indexed_reader::read(site_info& annotations, T&... destinations) { + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); while (this->good()) { this->read_variant_details(annotations); if (this->good() && region_compare(bounding_type_, annotations, region_)) { - this->read_requested_genos(destinations...); - break; + if (this->read_requested_genos(destinations...) > 0) + break; } } return *this; @@ -1120,6 +1119,7 @@ namespace savvy template indexed_reader& indexed_reader::read_if(Pred fn, site_info& annotations, T&... destinations) { + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); while (this->good()) { this->read_variant_details(annotations); @@ -1127,8 +1127,8 @@ namespace savvy { if (fn(annotations) && region_compare(bounding_type_, annotations, region_)) { - this->read_requested_genos(destinations...); - break; + if (this->read_requested_genos(destinations...) > 0) + break; } } } diff --git a/src/test.cpp b/src/test.cpp index eb8abac..becbd27 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -540,7 +540,7 @@ class convert_file_test } assert(output.good()); - assert(cnt == SAVVYT_MARKER_COUNT); + assert(cnt == (Fmt == savvy::fmt::haplotype_dosage ? SAVVYT_MARKER_COUNT_DOSE : SAVVYT_MARKER_COUNT_HARD)); } if (Fmt != savvy::fmt::haplotype_dosage) @@ -680,7 +680,7 @@ void sav_random_access_test(savvy::fmt format) assert(!rdr.read(anno, buf)); } -void generic_reader_test(const std::string& path, savvy::fmt format) +void generic_reader_test(const std::string& path, savvy::fmt format, std::size_t expected_markers) { savvy::reader<1> rdr(path, format); assert(rdr.good()); @@ -692,15 +692,12 @@ void generic_reader_test(const std::string& path, savvy::fmt format) savvy::site_info i; std::vector d; std::size_t cnt{}; - std::size_t sum{}; while (rdr.read(i, d)) { assert(d.size() == intersect.size() * savvy::sample_stride(format, 2)); - sum += std::count_if(d.begin(), d.end(), [](float f) { return f > 0.0f; }); ++cnt; } - assert(cnt == SAVVYT_MARKER_COUNT); - assert(sum == SAVVYT_ALLELE_COUNT); + assert(cnt == expected_markers); } template @@ -721,7 +718,7 @@ void subset_test(const std::string& path) assert(d.size() == intersect.size() * savvy::sample_stride(F, 2)); ++cnt; } - assert(cnt == SAVVYT_MARKER_COUNT); + assert(cnt == (F == savvy::fmt::haplotype_dosage ? SAVVYT_MARKER_COUNT_DOSE : SAVVYT_MARKER_COUNT_HARD)); } int main(int argc, char** argv) @@ -750,11 +747,14 @@ int main(int argc, char** argv) if (!file_exists(SAVVYT_SAV_FILE_HARD)) convert_file_test()(); if (!file_exists(SAVVYT_SAV_FILE_DOSE)) convert_file_test()(); - for (auto path : {SAVVYT_SAV_FILE_HARD}) //, SAVVYT_SAV_FILE_DOSE}) - { - for (savvy::fmt format : {savvy::fmt::allele, savvy::fmt::haplotype_dosage}) - generic_reader_test(path, format); - } + generic_reader_test(SAVVYT_VCF_FILE, savvy::fmt::allele, SAVVYT_MARKER_COUNT_HARD); + generic_reader_test(SAVVYT_VCF_FILE, savvy::fmt::haplotype_dosage, SAVVYT_MARKER_COUNT_DOSE); + + generic_reader_test(SAVVYT_SAV_FILE_HARD, savvy::fmt::allele, SAVVYT_MARKER_COUNT_HARD); + generic_reader_test(SAVVYT_SAV_FILE_HARD, savvy::fmt::haplotype_dosage, SAVVYT_MARKER_COUNT_HARD); + + generic_reader_test(SAVVYT_SAV_FILE_DOSE, savvy::fmt::allele, SAVVYT_MARKER_COUNT_DOSE); + generic_reader_test(SAVVYT_SAV_FILE_DOSE, savvy::fmt::haplotype_dosage, SAVVYT_MARKER_COUNT_DOSE); } else if (cmd == "random-access") { From be18133ceaa76865e27ca61c057ecb17317e6b61 Mon Sep 17 00:00:00 2001 From: jonathonl Date: Sun, 19 Nov 2017 19:07:25 -0500 Subject: [PATCH 26/86] Starts sav executable. --- CMakeLists.txt | 3 + src/sav.cpp | 158 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 src/sav.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cf267c..9406fc4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,9 @@ add_library(savvy ${SOURCE_FILES} ${HEADER_FILES}) target_link_libraries(savvy ${HTS_LIBRARY} ${ZLIB_LIBRARY} ${ZSTD_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(savvy PUBLIC $ $) +add_executable(sav src/vcf2sav.cpp) +target_link_libraries(sav savvy) + add_executable(bcf2m3vcf src/bcf2m3vcf.cpp) target_link_libraries(bcf2m3vcf savvy) diff --git a/src/sav.cpp b/src/sav.cpp new file mode 100644 index 0000000..358a361 --- /dev/null +++ b/src/sav.cpp @@ -0,0 +1,158 @@ +#include +#include "savvy/vcf_reader.hpp" +#include "savvy/sav_reader.hpp" +#include "savvy/savvy.hpp" + +#include +#include + +#include +#include + +class prog_args +{ +private: + static const int default_compression_level = 3; + static const int default_block_size = 2048; + + std::vector