diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8ec0d89 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +language: cpp +compiler: gcc +dist: trusty +sudo: required +group: edge +branches: + only: + - develop +addons: + apt: + sources: + - george-edison55-precise-backports + - ubuntu-toolchain-r-test + packages: + - cmake-data + - cmake + - python-dev + - python-pip +install: + - sudo pip install cget + - cget install -f ./requirements.txt +script: + - cmake --version + - mkdir build && cd build + - cmake -DBUILD_TESTS=1 -DCMAKE_TOOLCHAIN_FILE=../cget/cget/cget.cmake .. + - make + - make CTEST_OUTPUT_ON_FAILURE=1 test \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c74a0e3..fddecde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.2) project(savvy VERSION 1.0.0) include(CMakePackageConfigHelpers) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) #add_library(hts STATIC IMPORTED) #set_property(TARGET hts PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/libhts.a) @@ -16,49 +16,67 @@ endif() #find_package(shrinkwrap CONFIG REQUIRED) find_library(HTS_LIBRARY hts) find_library(ZLIB_LIBRARY z) -#ZSTD can only likn statically since experimental functions are being used. -find_library(ZSTD_LIBRARY NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}) +find_library(ZSTD_LIBRARY zstd) find_package(Threads) #get_target_property(SHRINKWRAP_LIBS shrinkwrap INTERFACE_LINK_LIBRARIES) -add_definitions(-DSAVVY_VERSION="${PROJECT_VERSION}-alpha") +add_definitions(-DSAVVY_VERSION="${PROJECT_VERSION}-alpha2") set(HEADER_FILES include/savvy/savvy.hpp include/savvy/reader.hpp include/savvy/m3vcf_reader.hpp include/savvy/sav_reader.hpp include/savvy/allele_status.hpp include/savvy/vcf_reader.hpp include/savvy/varint.hpp include/savvy/s1r.hpp include/savvy/site_info.hpp include/savvy/variant_iterator.hpp include/savvy/portable_endian.hpp include/savvy/region.hpp include/savvy/compressed_vector.hpp include/savvy/eigen3_vector.hpp include/savvy/ublas_vector.hpp include/savvy/armadillo_vector.hpp include/savvy/utility.hpp include/savvy/data_format.hpp) set(SOURCE_FILES - src/savvy.cpp src/reader.cpp src/m3vcf_reader.cpp include/savvy/sav_reader.hpp src/sav_reader.cpp include/savvy/test_class.hpp src/vcf_reader.cpp src/varint.cpp src/utility.cpp src/site_info.cpp src/region.cpp) + src/savvy/savvy.cpp src/savvy/reader.cpp src/savvy/m3vcf_reader.cpp include/savvy/sav_reader.hpp src/savvy/sav_reader.cpp include/test/test_class.hpp src/savvy/vcf_reader.cpp src/savvy/varint.cpp src/savvy/utility.cpp src/savvy/site_info.cpp src/savvy/region.cpp) add_library(savvy ${SOURCE_FILES} ${HEADER_FILES}) target_link_libraries(savvy ${HTS_LIBRARY} ${ZLIB_LIBRARY} ${ZSTD_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(savvy PUBLIC $ $) -add_executable(savvy-test src/test.cpp src/test_class.cpp include/savvy/test_class.hpp) -target_link_libraries(savvy-test savvy) +add_executable(sav + src/sav/main.cpp + src/sav/export.cpp include/sav/export.hpp + src/sav/import.cpp include/sav/import.hpp + src/sav/index.cpp include/sav/index.hpp + src/sav/merge.cpp include/sav/merge.hpp + src/sav/sort.cpp include/sav/sort.hpp + src/sav/utility.cpp include/sav/utility.hpp) +target_link_libraries(sav savvy) -add_executable(bcf2m3vcf src/bcf2m3vcf.cpp) -target_link_libraries(bcf2m3vcf savvy) +#add_executable(bcf2m3vcf src/sav/bcf2m3vcf.cpp) +#target_link_libraries(bcf2m3vcf savvy) -add_executable(vcf2sav src/vcf2sav.cpp) -target_link_libraries(vcf2sav savvy) +#add_executable(sav-sample-sort src/sav/sav_sample_sort.cpp) +#target_link_libraries(sav-sample-sort savvy) -add_executable(sav2vcf src/sav2vcf.cpp) -target_link_libraries(sav2vcf savvy) +#add_executable(savvy-speed-test src/test/savvy_speed_test.cpp) +#target_link_libraries(savvy-speed-test savvy) -add_executable(sav-sample-sort src/sav_sample_sort.cpp) -target_link_libraries(sav-sample-sort savvy) -add_executable(savvy-speed-test src/savvy_speed_test.cpp) -target_link_libraries(savvy-speed-test savvy) +if(BUILD_TESTS) + enable_testing() + + add_definitions(-DSAVVYT_VCF_FILE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file.vcf\" + -DSAVVYT_SAV_FILE_HARD=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_hard.sav\" + -DSAVVYT_SAV_FILE_DOSE=\"${CMAKE_CURRENT_SOURCE_DIR}/test_file_dose.sav\" + -DSAVVYT_MARKER_COUNT_HARD=28 + -DSAVVYT_MARKER_COUNT_DOSE=20) + + add_executable(savvy-test src/test/main.cpp src/test/test_class.cpp include/test/test_class.hpp) + target_link_libraries(savvy-test savvy) + + add_test(convert_file_test savvy-test convert-file) + add_test(subset_test savvy-test subset) + add_test(varint_test savvy-test varint) +endif() if(BUILD_SLR_EXAMPLES) - add_executable(slr-examples src/slr_examples.cpp) + add_executable(slr-examples src/test/slr_examples.cpp) target_link_libraries(slr-examples savvy armadillo) find_library(OPENCL_LIB OpenCL) - add_executable(linreg-ttest src/linreg_ttest.cpp) + add_executable(linreg-ttest src/test/linreg_ttest.cpp) target_link_libraries(linreg-ttest savvy ${OPENCL_LIB}) endif() @@ -67,6 +85,8 @@ install(TARGETS savvy EXPORT ${PROJECT_NAME}-config LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) +install(TARGETS sav RUNTIME DESTINATION bin) + install(EXPORT ${PROJECT_NAME}-config DESTINATION share/${PROJECT_NAME}) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake COMPATIBILITY SameMajorVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake DESTINATION share/${PROJECT_NAME}) diff --git a/LICENSE b/LICENSE index 65c5ca8..a612ad9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,165 +1,373 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 +Mozilla Public License Version 2.0 +================================== - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. +1. Definitions +-------------- +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. - 0. Additional Definitions. +1.3. "Contribution" + means Covered Software of a particular Contributor. - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/README.md b/README.md index 5b8c46f..6bd933a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# libsavvy +# Savvy Library Interface to various variant calling formats. ## Installing @@ -12,30 +12,18 @@ add_executable(prog main.cpp) target_link_libraries(prog savvy hts z zstd) ``` -## C++ 17 Class Template Argument Deduction -C++ 17 supports class template argument deduction, which means that template arguments can be deduced by constructor arguments. Compilers that do not support this must specify the number of data vectors to read as a template argument to the reader. +## Read Variants from File ```c++ -// With C++ 17 savvy::reader f("chr1.sav", savvy::fmt::allele); -savvy::reader f("chr1.sav", savvy::fmt::allele, savvy::fmt::genotype_likelilhoods); +savvy::variant> var; -// Without C++ 17 -savvy::reader<1> f("chr1.sav", savvy::fmt::allele); -savvy::reader<2> f("chr1.sav", savvy::fmt::allele, savvy::fmt::genotype_likelilhoods); -``` - -## Read Variants from File -```c++ -savvy::reader<1> f("chr1.sav", savvy::fmt::allele); -savvy::site_info anno; -std::vector alleles; -while (f.read(anno, alleles)) +while (f >> var) { - anno.locus(); - anno.chromosome(); - anno.ref(); - anno.alt(); - for (const float& allele : alleles) + var.locus(); + var.chromosome(); + var.ref(); + var.alt(); + for (const float& allele : var.data()) { ... } @@ -44,47 +32,54 @@ while (f.read(anno, alleles)) ## Indexed Files ```c++ -savvy::indexed_reader<1> f("chr1.sav", {"X", 100000, 199999}, savvy::fmt::allele); -savvy::site_info anno; -std::vector alleles; +savvy::indexed_reader f("chr1.sav", {"X", 100000, 199999}, savvy::fmt::allele); +savvy::variant> var; -while (f.read(anno, alleles)) +while (f >> v) { ... } f.reset_region({"X", 200000, 299999}); -while (f.read(anno, alleles)) +while (f >> v) { ... } ``` -## Multiple Data Vectors +## Read annotations and genotypes separately. ```c++ -savvy::reader<2> f("chr1.bcf", savvy::fmt::genotype, savvy::fmt::dosage); +savvy::reader f("chr1.sav", savvy::fmt::allele); savvy::site_info anno; -std::vector genotypes; -std::vector dosages; -while (f.read(anno, genotypes, dosages)) +std::vector alleles; + +while (f.read(anno, alleles)) { anno.locus(); anno.chromosome(); anno.ref(); anno.alt(); - - for (const float& gt : genotypes) - { - ... - } - - for (const float& ds : dosages) + for (const float& allele : alleles) { ... } } ``` +## Subsetting Samples +```c++ +savvy::reader f("chr1.sav", savvy::fmt::allele); +std::vector requested = {"ID001","ID002","ID003"}; +std::vector intersect = f.subset_samples({requested.begin(), requested.end()}); + +savvy::site_info anno; +std::vector alleles; +while (f.read(anno, alleles)) +{ + ... +} +``` + ## 3rd-party Vectors The reader classes utilize generic programming to efficiently support 3rd-party linear algebra libraries. ```c++ @@ -93,14 +88,16 @@ std::vector std_vector; savvy::compressed_vector savvy_sparse_vector; boost::numeric::ublas::compressed_vector ublas_sparse_vector; -savvy::reader<3> f("chr1.bcf", savvy::fmt::genotype_likelihoods, savvy::fmt::genotype, savvy::fmt::dosage); -f.read(anno, std_vector, savvy_sparse_vector, ublas_sparse_vector); +savvy::reader f("chr1.sav", savvy::fmt::dosage); +f.read(anno, std_vector); +f.read(anno, savvy_sparse_vector); +f.read(anno, ublas_sparse_vector); ``` ## Read Predicates Reading genotypes can be bypassed when using a read predicate. ```c++ -savvy::indexed_reader<1> f("chr1.sav", savvy::fmt::allele); +savvy::indexed_reader< f("chr1.sav", savvy::fmt::allele); savvy::site_info anno; savvy::compressed_vector gt; @@ -111,7 +108,7 @@ while (f.read_if([](const site_info& v) { return std::stof(v.prop("AF")) < 0.1; } ``` ```c++ -savvy::indexed_reader<1> f("chr1.sav", savvy::fmt::allele); +savvy::indexed_reader f("chr1.sav", savvy::fmt::allele); savvy::anno; std::vector buf; @@ -158,7 +155,7 @@ auto lin_reg = [](const std::vector& x, const std::vector& y) return std::make_tuple(m, b, r2); // slope, y-intercept, r-squared }; -savvy::reader<1> f("chr1.sav", savvy::fmt::genotype); +savvy::reader f("chr1.sav", savvy::fmt::genotype); savvy::site_info anno; std::vector geno; std::vector pheno(f.sample_size()); @@ -181,7 +178,7 @@ auto arma_lin_reg = [](const arma::Col& x, const arma::Col& y) return std::make_tuple(m, b, r2); // slope, y-intercept, r-squared }; -savvy::reader<1> f("chr1.sav", savvy::fmt::allele); +savvy::reader f("chr1.sav", savvy::fmt::allele); savvy::site_info anno; savvy::armadillo::dense_vector geno; arma::Col pheno(f.sample_size() * 2); @@ -191,4 +188,59 @@ while (f.read(anno, geno)) auto [ m, b, r2 ] = arma_lin_reg(geno, pheno); // ... } +``` + +## Multiple Data Vectors +```c++ +savvy::vcf::reader<2> f("chr1.bcf", savvy::fmt::genotype, savvy::fmt::dosage); +savvy::site_info anno; +std::vector genotypes; +std::vector dosages; +while (f.read(anno, genotypes, dosages)) +{ + anno.locus(); + anno.chromosome(); + anno.ref(); + anno.alt(); + + for (const float& gt : genotypes) + { + ... + } + + for (const float& ds : dosages) + { + ... + } +} +``` + +## C++ 17 Class Template Argument Deduction +C++ 17 supports class template argument deduction, which means that template arguments can be deduced by constructor arguments. Compilers that do not support this must specify the number of data vectors to read as a template argument to the reader. +```c++ +// With C++ 17 +savvy::vcf::reader f("chr1.bcf", savvy::fmt::allele); +savvy::vcf::reader f("chr1.bcf", savvy::fmt::allele, savvy::fmt::genotype_likelilhoods); + +// Without C++ 17 +savvy::vcf::reader<1> f("chr1.bcf", savvy::fmt::allele); +savvy::vcf::reader<2> f("chr1.bcf", savvy::fmt::allele, savvy::fmt::genotype_likelilhoods); +``` + +# SAV Command Line Interface +File manipulation for SAV format. + +## Import +```shell +sav import --sort --index file.bcf file.sav +``` + +## Merge +```shell +sav merge file1.sav file2.sav > merged.sav +``` + +## Export +```shell +sav export --regions chr1,chr2:10000-20000 --sample-ids ID1,ID2,ID3 file.sav > file.vcf ``` \ No newline at end of file diff --git a/dep/htslib.cmake b/dep/htslib.cmake index 2abcd70..8057ecb 100644 --- a/dep/htslib.cmake +++ b/dep/htslib.cmake @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.2) project(htslib VERSION 1.3.1) -execute_process(COMMAND ./configure --prefix=${CMAKE_INSTALL_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +execute_process(COMMAND ./configure --disable-libcurl --disable-lzma --disable-bz2 --prefix=${CMAKE_INSTALL_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_custom_target(hts ALL COMMAND make WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMENT "Builing htslib ...") @@ -10,8 +10,12 @@ install(DIRECTORY htslib DESTINATION include) if (BUILD_SHARED_LIBS) install(FILES ${CMAKE_SHARED_LIBRARY_PREFIX}hts${CMAKE_SHARED_LIBRARY_SUFFIX} - ${CMAKE_SHARED_LIBRARY_PREFIX}hts.1${CMAKE_SHARED_LIBRARY_SUFFIX} DESTINATION lib) + install(FILES + ${CMAKE_SHARED_LIBRARY_PREFIX}hts.1${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_SHARED_LIBRARY_PREFIX}hts${CMAKE_SHARED_LIBRARY_SUFFIX}.1 + DESTINATION lib + OPTIONAL) else() install(FILES ${CMAKE_STATIC_LIBRARY_PREFIX}hts${CMAKE_STATIC_LIBRARY_SUFFIX} diff --git a/include/sav/export.hpp b/include/sav/export.hpp new file mode 100644 index 0000000..5168e8e --- /dev/null +++ b/include/sav/export.hpp @@ -0,0 +1,12 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_EXPORT_HPP +#define SAVVY_SAV_EXPORT_HPP + +int export_main(int argc, char** argv); + +#endif //SAVVY_SAV_EXPORT_HPP \ No newline at end of file diff --git a/include/sav/import.hpp b/include/sav/import.hpp new file mode 100644 index 0000000..c4acbf5 --- /dev/null +++ b/include/sav/import.hpp @@ -0,0 +1,12 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_IMPORT_HPP +#define SAVVY_SAV_IMPORT_HPP + +int import_main(int argc, char** argv); + +#endif //SAVVY_SAV_IMPORT_HPP \ No newline at end of file diff --git a/include/sav/index.hpp b/include/sav/index.hpp new file mode 100644 index 0000000..e5f4c78 --- /dev/null +++ b/include/sav/index.hpp @@ -0,0 +1,12 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_INDEX_HPP +#define SAVVY_SAV_INDEX_HPP + +int index_main(int argc, char** argv); + +#endif //SAVVY_SAV_INDEX_HPP \ No newline at end of file diff --git a/include/sav/merge.hpp b/include/sav/merge.hpp new file mode 100644 index 0000000..b75ac9a --- /dev/null +++ b/include/sav/merge.hpp @@ -0,0 +1,12 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_MERGE_HPP +#define SAVVY_SAV_MERGE_HPP + +int merge_main(int argc, char** argv); + +#endif //SAVVY_SAV_MERGE_HPP \ No newline at end of file diff --git a/include/sav/sort.hpp b/include/sav/sort.hpp new file mode 100644 index 0000000..96e574b --- /dev/null +++ b/include/sav/sort.hpp @@ -0,0 +1,178 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_SORT_HPP +#define SAVVY_SAV_SORT_HPP + +#include "savvy/sav_reader.hpp" + +#include +#include + +class less_than_comparator +{ +public: + less_than_comparator(savvy::s1r::sort_type type); + bool operator()(const savvy::site_info& a, const savvy::site_info& b); +private: + bool left(const savvy::site_info& a, const savvy::site_info& b); + bool right(const savvy::site_info& a, const savvy::site_info& b); + bool mid(const savvy::site_info& a, const savvy::site_info& b); +private: + savvy::s1r::sort_type sort_type_; +}; + +class random_string_generator +{ +public: + random_string_generator(); + std::string operator()(std::size_t length); +private: + std::vector char_array_ = {'0','1','2','3','4','5','6','7','8','9', + 'A','B','C','D','E','F','G','H','I','J', + 'K','L','M','N','O','P','Q','R','S','T', + 'U','V','W','X','Y','Z'}; + std::mt19937 rg_; + std::uniform_int_distribution dist_; +}; + +class headless_sav_writer : public savvy::sav::writer +{ +public: + template + headless_sav_writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, savvy::fmt data_format) + : + savvy::sav::writer("", samples_beg, samples_end, headers_beg, headers_end, data_format) + { + output_buf_ = std::unique_ptr(new shrinkwrap::zstd::obuf(file_path)); + output_stream_.rdbuf(output_buf_.get()); + file_path_ = file_path; + } + + const std::string& file_path() const { return this->file_path_; } +}; + +class headless_sav_reader : public savvy::sav::reader +{ +public: + template + headless_sav_reader(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, savvy::fmt format) : + savvy::sav::reader("", format) + { + file_data_format_ = format; + file_path_ = file_path; + input_stream_ = savvy::detail::make_unique(file_path); + for (auto it = headers_beg; it != headers_end; ++it) + { + if (it->first == "INFO") + { + std::string info_field = savvy::parse_header_id(it->second); + metadata_fields_.push_back(std::move(info_field)); + } + else if (it->first == "FORMAT") + { + std::string format_field = savvy::parse_header_id(it->second); + if (format_field == "GT") + file_data_format_ = savvy::fmt::allele; +// else if (format_field == "GP") +// file_data_format_ = fmt::genotype_probability; + else if (format_field == "HDS") + file_data_format_ = savvy::fmt::haplotype_dosage; + } + headers_.emplace_back(it->first, it->second); + } + + this->sample_ids_.assign(samples_beg, samples_end); + + } +}; + +template +bool sort_and_write_records(savvy::s1r::sort_type sort, Reader& in, savvy::fmt in_format, const std::vector& regions, Writer& out, savvy::fmt out_format) +{ + less_than_comparator less_than(sort); + + random_string_generator str_gen; + std::size_t temp_file_size = 7; + std::deque temp_writers; + std::deque temp_readers; + + + std::vector> in_mem_variants(temp_file_size); + std::vector>> in_mem_variant_refs(in_mem_variants.begin(), in_mem_variants.end()); + + std::size_t read_counter = temp_file_size; + while (read_counter == temp_file_size) + { + read_counter = 0; + for (; read_counter < temp_file_size; ++read_counter) + { + in >> in_mem_variant_refs[read_counter].get(); + if (!in.good()) + break; + } + + if (read_counter) + { + std::sort(in_mem_variant_refs.begin(), in_mem_variant_refs.begin() + read_counter, less_than); + + std::string temp_path = "/tmp/tmp-" + str_gen(8) + ".sav"; + temp_writers.emplace_back(temp_path, in.samples().begin(), in.samples().end(), in.headers().begin(), in.headers().end(), in_format); + temp_readers.emplace_back(temp_path, in.samples().begin(), in.samples().end(), in.headers().begin(), in.headers().end(), out_format); + std::remove(temp_path.c_str()); + for (std::size_t i = 0; i> write_variants(temp_readers.size()); + std::size_t i = 0; + for (auto it = temp_readers.begin(); it != temp_readers.end(); ++it) + { + it->read(write_variants[i], write_variants[i].data()); + ++i; + } + + + std::size_t min_index; + + do + { + min_index = write_variants.size(); + for (std::size_t rdr_index = 0; rdr_index < write_variants.size(); ++rdr_index) + { + if (temp_readers[rdr_index].good()) + { + if (min_index < write_variants.size()) + { + if (less_than(write_variants[rdr_index], write_variants[min_index])) + min_index = rdr_index; + } + else + { + min_index = rdr_index; + } + } + } + + if (min_index < write_variants.size()) + { + out << write_variants[min_index]; + temp_readers[min_index] >> write_variants[min_index]; + } + + } while (min_index < write_variants.size()); + + return false; +} + +#endif //SAVVY_SAV_SORT_HPP diff --git a/include/sav/utility.hpp b/include/sav/utility.hpp new file mode 100644 index 0000000..a2b998c --- /dev/null +++ b/include/sav/utility.hpp @@ -0,0 +1,21 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef SAVVY_SAV_UTILITY_HPP +#define SAVVY_SAV_UTILITY_HPP + +#include "savvy/region.hpp" + +#include +#include +#include + +savvy::region string_to_region(const std::string& s); +std::vector split_string_to_vector(const char* in, char delim); +std::set split_string_to_set(const char* in, char delim); +std::set split_file_to_set(const char* in); + +#endif //SAVVY_SAV_UTILITY_HPP diff --git a/include/savvy/allele_status.hpp b/include/savvy/allele_status.hpp index bd38094..60d077f 100644 --- a/include/savvy/allele_status.hpp +++ b/include/savvy/allele_status.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_ALLELE_STATUS_HPP #define LIBSAVVY_ALLELE_STATUS_HPP diff --git a/include/savvy/armadillo_vector.hpp b/include/savvy/armadillo_vector.hpp index 0c68626..ef1df0e 100644 --- a/include/savvy/armadillo_vector.hpp +++ b/include/savvy/armadillo_vector.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_ARMADILLO_VECTOR_HPP #define LIBSAVVY_ARMADILLO_VECTOR_HPP diff --git a/include/savvy/compressed_vector.hpp b/include/savvy/compressed_vector.hpp index 1baaa42..5685bc1 100644 --- a/include/savvy/compressed_vector.hpp +++ b/include/savvy/compressed_vector.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_SPARSE_VECTOR_HPP #define LIBSAVVY_SPARSE_VECTOR_HPP @@ -15,6 +20,90 @@ namespace savvy typedef compressed_vector self_type; static const T const_value_type; + class iterator + { + public: + typedef iterator self_type; + typedef std::ptrdiff_t difference_type; + typedef T value_type; + typedef value_type& reference; + typedef value_type* pointer; + typedef std::input_iterator_tag iterator_category; + + iterator() : vec_(nullptr), beg_(nullptr), cur_(beg_) {} + iterator(compressed_vector& file_reader, std::size_t off) : + vec_(&file_reader), + beg_(file_reader.values_.data()), + cur_(file_reader.values_.data() + off) + { + + } + + std::size_t offset() const + { + return vec_->offsets_[cur_ - beg_]; + } + + self_type operator++() + { + self_type ret = *this; + ++cur_; + return ret; + } + + void operator++(int) { ++cur_; } + reference operator*() { return *cur_; } + pointer operator->() { return cur_; } + bool operator==(const self_type& rhs) const { return (cur_ == rhs.cur_); } + bool operator!=(const self_type& rhs) const { return (cur_ != rhs.cur_); } + private: + compressed_vector* vec_; + const value_type*const beg_; + value_type* cur_; + }; + + class const_iterator + { + public: + typedef const_iterator self_type; + typedef std::ptrdiff_t difference_type; + typedef T value_type; + typedef const value_type& reference; + typedef const value_type* pointer; + typedef std::input_iterator_tag iterator_category; + + const_iterator() : vec_(nullptr), beg_(nullptr), cur_(beg_) {} + const_iterator(const compressed_vector& file_reader, std::size_t off) : + vec_(&file_reader), + beg_(file_reader.values_.data()), + cur_(beg_ + off) + { + + } + + std::size_t offset() const + { + return vec_->offsets_[cur_ - beg_]; + } + + self_type operator++() + { + self_type ret = *this; + ++cur_; + return ret; + } + + void operator++(int) { ++cur_; } + reference operator*() const { return *cur_; } + const pointer operator->() const { return cur_; } + bool operator==(const self_type& rhs) const { return (cur_ == rhs.cur_); } + bool operator!=(const self_type& rhs) const { return (cur_ != rhs.cur_); } + private: + const compressed_vector* vec_; + const value_type*const beg_; + const value_type* cur_; + }; + compressed_vector() : size_(0) { @@ -40,11 +129,14 @@ namespace savvy } } - auto begin() const { return this->values_.cbegin(); } - auto end() const { return this->values_.cend(); } + const_iterator cbegin() const { return const_iterator(*this, 0); } + const_iterator cend() const { return const_iterator(*this, this->values_.size()); } + + const_iterator begin() const { return this->cbegin(); } + const_iterator end() const { return this->cend(); } - auto begin() { return this->values_.begin(); } - auto end() { return this->values_.end(); } + iterator begin() { return iterator(*this, 0); } + iterator end() { return iterator(*this, this->values_.size()); } const value_type& operator[](std::size_t pos) const { diff --git a/include/savvy/data_format.hpp b/include/savvy/data_format.hpp index 7139c7a..8e475da 100644 --- a/include/savvy/data_format.hpp +++ b/include/savvy/data_format.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_DATA_FORMAT_HPP #define LIBSAVVY_DATA_FORMAT_HPP @@ -14,6 +19,7 @@ namespace savvy genotype_likelihood, phred_scaled_genotype_likelihood, dosage, + haplotype_dosage, //phase, // gt = genotype, // gp = genotype_probability, @@ -22,5 +28,19 @@ namespace savvy // ds = dosage // ec = dosage }; + + inline std::uint64_t sample_stride(fmt format, std::uint64_t ploidy) + { + switch (format) + { + case fmt::allele: return ploidy; + case fmt::genotype: return 1; + case fmt::genotype_probability: return ploidy + 1; + case fmt::genotype_likelihood: return ploidy + 1; + case fmt::phred_scaled_genotype_likelihood: return ploidy + 1; + case fmt::dosage: return 1; + case fmt::haplotype_dosage: return ploidy; + } + } } #endif //LIBSAVVY_DATA_FORMAT_HPP diff --git a/include/savvy/eigen3_vector.hpp b/include/savvy/eigen3_vector.hpp index b5572fe..45feb11 100644 --- a/include/savvy/eigen3_vector.hpp +++ b/include/savvy/eigen3_vector.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_EIGEN3_VECTOR_HPP #define LIBSAVVY_EIGEN3_VECTOR_HPP diff --git a/include/savvy/m3vcf_reader.hpp b/include/savvy/m3vcf_reader.hpp index 04b5727..37dbf5a 100644 --- a/include/savvy/m3vcf_reader.hpp +++ b/include/savvy/m3vcf_reader.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_M3VCF_READER_HPP #define LIBSAVVY_M3VCF_READER_HPP diff --git a/include/savvy/portable_endian.hpp b/include/savvy/portable_endian.hpp index 0dbf558..359f500 100644 --- a/include/savvy/portable_endian.hpp +++ b/include/savvy/portable_endian.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef PORTABLE_ENDIAN_H__ #define PORTABLE_ENDIAN_H__ diff --git a/include/savvy/reader.hpp b/include/savvy/reader.hpp index 2914aee..6189d4d 100644 --- a/include/savvy/reader.hpp +++ b/include/savvy/reader.hpp @@ -1,5 +1,11 @@ -#ifndef VC_READER_HPP -#define VC_READER_HPP +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef LIBSAVVY_READER_HPP +#define LIBSAVVY_READER_HPP #include "site_info.hpp" #include "sav_reader.hpp" @@ -13,7 +19,6 @@ namespace savvy { //################################################################// - template class reader_base { public: @@ -142,246 +147,147 @@ namespace savvy // return good(); // } - std::vector prop_fields() const; - sample_iterator samples_begin() const; - sample_iterator samples_end() const; - std::size_t sample_size() const; + const std::vector& info_fields() const; + const std::vector& samples() const; + const std::vector>& headers() const; + + std::vector subset_samples(const std::set& subset); + private: + static const std::vector empty_string_vector; + static const std::vector> empty_string_pair_vector; protected: - virtual savvy::sav::reader_base* sav_impl() const = 0; - virtual savvy::vcf::reader_base* vcf_impl() const = 0; + virtual savvy::sav::reader_base* sav_impl() const = 0; + virtual savvy::vcf::reader_base<1>* vcf_impl() const = 0; }; //################################################################// //################################################################// - - template - class reader : public reader_base + class reader : public reader_base { public: reader() {} - template - reader(const std::string& file_path, T... data_formats); + template + reader(const std::string& file_path, T data_format); ~reader() {} - template - reader& operator>>(std::tuple& destination); + template + reader& operator>>(variant& destination); - template - reader& read(site_info& annotations, T&... destinations); + template + reader& read(site_info& annotations, T& destination); private: - savvy::sav::reader_base* sav_impl() const { return sav_reader_.get(); } - savvy::vcf::reader_base* vcf_impl() const { return vcf_reader_.get(); } + savvy::sav::reader_base* sav_impl() const { return sav_reader_.get(); } + savvy::vcf::reader_base<1>* vcf_impl() const { return vcf_reader_.get(); } private: - std::unique_ptr> sav_reader_; - std::unique_ptr> vcf_reader_; + std::unique_ptr sav_reader_; + std::unique_ptr> vcf_reader_; }; - -#if __cpp_deduction_guides >= 201606 - template - reader(const std::string& file_path, T... data_formats) -> reader; -#endif //################################################################// //################################################################// - template - class indexed_reader : public reader_base + class indexed_reader : public reader_base { public: indexed_reader() {} - template - indexed_reader(const std::string& file_path, const region& reg, T... data_formats); - template - indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T... data_formats); + template + indexed_reader(const std::string& file_path, const region& reg, T data_format); + template + indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T data_format); void reset_region(const region& reg); std::vector chromosomes() const; - template - indexed_reader& operator>>(std::tuple& destination); + template + indexed_reader& operator>>(variant& destination); - template - indexed_reader& read(site_info& annotations, T&... destinations); + template + indexed_reader& read(site_info& annotations, T& destination); - template - indexed_reader& read_if(Pred fn, site_info& annotations, T&... destinations); + template + indexed_reader& read_if(Pred fn, site_info& annotations, T& destination); private: - savvy::sav::reader_base* sav_impl() const { return sav_reader_.get(); } - savvy::vcf::reader_base* vcf_impl() const { return vcf_reader_.get(); } + savvy::sav::reader_base* sav_impl() const { return sav_reader_.get(); } + savvy::vcf::reader_base<1>* vcf_impl() const { return vcf_reader_.get(); } private: - std::unique_ptr> sav_reader_; - std::unique_ptr> vcf_reader_; + std::unique_ptr sav_reader_; + std::unique_ptr> vcf_reader_; }; - -#if __cpp_deduction_guides >= 201606 - template - indexed_reader(const std::string& file_path, const region& reg, T... data_formats) -> indexed_reader; - - template - indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T... data_formats) -> indexed_reader; -#endif //################################################################// - - - - - - - //################################################################// - template - std::vector reader_base::prop_fields() const - { - if (sav_impl()) - return sav_impl()->prop_fields(); - else if (vcf_impl()) - return vcf_impl()->prop_fields(); - return {}; - } - - template - typename reader_base::sample_iterator reader_base::samples_begin() const - { - reader_base::sample_iterator ret; - if (sav_impl()) - ret = reader_base::sample_iterator(sav_impl()->samples_begin()); - else if (vcf_impl()) - ret = reader_base::sample_iterator(vcf_impl()->samples_begin()); - return ret; - } - - template - typename reader_base::sample_iterator reader_base::samples_end() const - { - reader_base::sample_iterator ret; - if (sav_impl()) - ret = reader_base::sample_iterator(sav_impl()->samples_end()); - else if (vcf_impl()) - ret = reader_base::sample_iterator(vcf_impl()->samples_end()); - return ret; - } - - template - std::size_t reader_base::sample_size() const - { - std::size_t ret{}; - if (sav_impl()) - ret = static_cast(sav_impl()->samples_end() - sav_impl()->samples_begin()); - else if (vcf_impl()) - ret = static_cast(vcf_impl()->samples_end() - vcf_impl()->samples_begin()); - return ret; - } - //################################################################// - //################################################################// - template - template - reader& reader::operator>>(std::tuple& destination) + template + reader& reader::operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + return this->read(destination, destination.data()); } - template - template - reader& reader::read(site_info& annotations, T&... destinations) + template + reader& reader::read(site_info& annotations, T& destination) { if (sav_impl()) - sav_reader_->read(annotations, destinations...); + sav_reader_->read(annotations, destination); else if (vcf_impl()) - vcf_reader_->read(annotations, destinations...); + vcf_reader_->read(annotations, destination); return *this; } - template - template - reader::reader(const std::string& file_path, T... data_formats) + template + reader::reader(const std::string& file_path, T data_format) { if (::savvy::detail::has_extension(file_path, ".sav")) - sav_reader_ = ::savvy::detail::make_unique>(file_path, data_formats...); + sav_reader_ = ::savvy::detail::make_unique(file_path, data_format); else if (::savvy::detail::has_extension(file_path, ".vcf") || ::savvy::detail::has_extension(file_path, ".vcf.gz") || ::savvy::detail::has_extension(file_path, ".bcf")) - vcf_reader_ = detail::make_unique>(file_path, data_formats...); + vcf_reader_ = detail::make_unique>(file_path, data_format); } //################################################################// //################################################################// - template - std::vector indexed_reader::chromosomes() const - { - if (sav_reader_) - return sav_reader_->chromosomes(); - else if (vcf_reader_) - return vcf_reader_->chromosomes(); - return {}; - } - - template - template - indexed_reader::indexed_reader(const std::string& file_path, const region& reg, T... data_formats) + template + indexed_reader::indexed_reader(const std::string& file_path, const region& reg, T data_format) { if (::savvy::detail::has_extension(file_path, ".sav")) - sav_reader_ = ::savvy::detail::make_unique>(file_path, reg, data_formats...); + sav_reader_ = ::savvy::detail::make_unique(file_path, reg, data_format); else if (::savvy::detail::has_extension(file_path, ".vcf") || ::savvy::detail::has_extension(file_path, ".vcf.gz") || ::savvy::detail::has_extension(file_path, ".bcf")) - vcf_reader_ = ::savvy::detail::make_unique>(file_path, reg, data_formats...); + vcf_reader_ = ::savvy::detail::make_unique>(file_path, reg, data_format); } - template - template - indexed_reader::indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T... data_formats) + template + indexed_reader::indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T data_format) { if (::savvy::detail::has_extension(file_path, ".sav")) - sav_reader_ = ::savvy::detail::make_unique>(file_path, reg, bounding_type, data_formats...); + sav_reader_ = ::savvy::detail::make_unique(file_path, reg, bounding_type, data_format); else if (::savvy::detail::has_extension(file_path, ".vcf") || ::savvy::detail::has_extension(file_path, ".vcf.gz") || ::savvy::detail::has_extension(file_path, ".bcf")) - vcf_reader_ = ::savvy::detail::make_unique>(file_path, reg, bounding_type, data_formats...); - } - - template - void indexed_reader::reset_region(const region& reg) - { - if (sav_reader_) - sav_reader_->reset_region(reg); - else if (vcf_reader_) - vcf_reader_->reset_region(reg); + vcf_reader_ = ::savvy::detail::make_unique>(file_path, reg, bounding_type, data_format); } - template - template - indexed_reader& indexed_reader::operator>>(std::tuple& destination) + template + indexed_reader& indexed_reader::operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + return this->read(destination, destination.data()); } - template - template - indexed_reader& indexed_reader::read(site_info& annotations, T&... destinations) + template + indexed_reader& indexed_reader::read(site_info& annotations, T& destination) { if (sav_impl()) - sav_reader_->read(annotations, destinations...); + sav_reader_->read(annotations, destination); else if (vcf_impl()) - vcf_reader_->read(annotations, destinations...); + vcf_reader_->read(annotations, destination); return *this; } - template - template - indexed_reader& indexed_reader::read_if(Pred fn, site_info& annoations, T&... destinations) + template + indexed_reader& indexed_reader::read_if(Pred fn, site_info& annoations, T& destination) { if (sav_reader_) - sav_reader_->read_if(fn, annoations, destinations... ); + sav_reader_->read_if(fn, annoations, destination); else if (vcf_reader_) - vcf_reader_->read_if(fn, annoations, destinations...); + vcf_reader_->read_if(fn, annoations, destination); return *this; } + //################################################################// } #endif //VC_READER_HPP \ No newline at end of file diff --git a/include/savvy/region.hpp b/include/savvy/region.hpp index ec7a432..1aa6c1e 100644 --- a/include/savvy/region.hpp +++ b/include/savvy/region.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_REGION_HPP #define LIBSAVVY_REGION_HPP @@ -42,7 +47,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() <= reg.to() && (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1) >= reg.from() && var.chromosome() == reg.chromosome()); + return (var.position() <= reg.to() && (var.position() + std::max(var.ref().size(), var.alt().size()) - 1) >= reg.from() && var.chromosome() == reg.chromosome()); } }; @@ -50,7 +55,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() >= reg.from() && (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1) <= reg.to() && var.chromosome() == reg.chromosome()); + return (var.position() >= reg.from() && (var.position() + std::max(var.ref().size(), var.alt().size()) - 1) <= reg.to() && var.chromosome() == reg.chromosome()); } }; @@ -58,7 +63,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - return (var.locus() >= reg.from() && var.locus() <= reg.to() && var.chromosome() == reg.chromosome()); + return (var.position() >= reg.from() && var.position() <= reg.to() && var.chromosome() == reg.chromosome()); } }; @@ -66,7 +71,7 @@ namespace savvy { static bool compare(const site_info& var, const region& reg) { - std::uint64_t right = (var.locus() + std::max(var.ref().size(), var.alt().size()) - 1); + std::uint64_t right = (var.position() + std::max(var.ref().size(), var.alt().size()) - 1); return (right >= reg.from() && right <= reg.to() && var.chromosome() == reg.chromosome()); } }; diff --git a/include/savvy/s1r.hpp b/include/savvy/s1r.hpp index 6abfff1..b57af81 100644 --- a/include/savvy/s1r.hpp +++ b/include/savvy/s1r.hpp @@ -1,22 +1,76 @@ -#ifndef LIBSAVVY_CMF_INDEX_READER_HPP -#define LIBSAVVY_CMF_INDEX_READER_HPP +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef LIBSAVVY_S1R_INDEX_READER_HPP +#define LIBSAVVY_S1R_INDEX_READER_HPP #include "portable_endian.hpp" #include "region.hpp" +#include #include -#include +#include #include #include #include #include -#include +#include #include +#include +#include +#include namespace savvy { namespace s1r { + class internal_entry + { + public: + std::uint32_t region_start() const { return be32toh(region_start_); } + std::uint32_t region_length() const { return be32toh(region_length_); } + std::uint32_t region_end() const { return this->region_start() + this->region_length(); } + + internal_entry() : + region_start_(0), + region_length_(0) + { } + + internal_entry(std::uint32_t beg, std::uint32_t end) : + region_start_(htobe32(beg)), + region_length_(end > beg ? htobe32(end - beg) : 0) + { } + private: + std::uint32_t region_start_; + std::uint32_t region_length_; + }; + + static_assert(sizeof(internal_entry) == 8, "Alignment issue (internal_entry)"); + + class entry : public internal_entry + { + public: + std::uint64_t value() const + { + return be64toh(value_); + } + + entry() : + value_(0) + { } + + entry(std::uint32_t beg, std::uint32_t end, std::uint64_t value) : + internal_entry(beg, end), + value_(htobe64(value)) + { } + private: + std::uint64_t value_; + }; + static_assert(sizeof(entry) == 16, "Alignment issue (entry)"); + namespace detail { inline std::uint64_t ceil_divide(std::uint64_t x, std::uint64_t y) @@ -31,58 +85,18 @@ namespace savvy inline std::size_t entries_per_leaf_node(std::size_t block_size) { - return block_size / std::size_t(16); + return std::uint16_t(block_size / sizeof(entry)); } inline std::size_t entries_per_internal_node(std::size_t block_size) { - return block_size / std::uint16_t(8); + return std::uint16_t(block_size / sizeof(internal_entry)); } } class tree_base { public: - - class internal_entry - { - public: - std::uint64_t region_start() const { return be64toh(region_start_); } - std::uint64_t region_length() const { return be64toh(region_length_); } - std::uint64_t region_end() const { return this->region_start() + this->region_length(); } - internal_entry() : - region_start_(0), - region_length_(0) - { } - internal_entry(std::uint64_t beg, std::uint64_t end) : - region_start_(htobe64(beg)), - region_length_(end > beg ? htobe64(end - beg) : 0) - { } - private: - std::uint64_t region_start_; - std::uint64_t region_length_; - }; - - class entry : public internal_entry - { - public: - std::uint64_t value() const - { - return be64toh(value_); - } - - entry() : - value_(0) - { } - - entry(std::uint64_t beg, std::uint64_t end, std::uint64_t value) : - internal_entry(beg, end), - value_(htobe64(value)) - { } - private: - std::uint64_t value_; - }; - struct node_position { node_position(std::uint64_t lev, std::uint64_t node_off) : level(lev), node_offset(node_off) {} @@ -99,12 +113,32 @@ namespace savvy std::uint64_t entry_offset; }; - std::uint64_t header_block_count() const { return root_block_offset_; } + tree_base(std::uint8_t block_size_in_kib, std::uint64_t block_offset, std::uint64_t entry_count) + { + //const std::uint16_t header_data_size = 7 + 2 + 8; + block_size_ = 1024u * (std::uint32_t(block_size_in_kib) + 1); + block_offset_ = block_offset; //ceil_divide(header_data_size, block_size_); + entry_count_ = entry_count; + + std::size_t node_count = 1; + entry_counts_per_level_.clear(); + this->entry_counts_per_level_.insert(this->entry_counts_per_level_.begin(), entry_count_); + for (std::uint64_t nodes_at_current_level = detail::ceil_divide(entry_count_, (std::uint64_t) this->entries_per_leaf_node()); + nodes_at_current_level > 1; + nodes_at_current_level = detail::ceil_divide(nodes_at_current_level, (std::uint64_t) this->entries_per_internal_node())) + { + node_count += nodes_at_current_level; + this->entry_counts_per_level_.insert(this->entry_counts_per_level_.begin(), nodes_at_current_level); + } + + root_block_end_pos_ = (block_offset + node_count) * block_size_; + } + std::uint64_t entry_count() const { return entry_count_; } std::uint64_t bucket_size() const { return block_size_; } std::uint64_t entry_count_at_level(std::size_t level) const { return entry_counts_per_level_[level]; } - std::uint16_t entries_per_leaf_node() const { return block_size_ / std::uint16_t(16); } - std::uint16_t entries_per_internal_node() const { return block_size_ / std::uint16_t(8); } + std::uint16_t entries_per_leaf_node() const { return std::uint16_t(block_size_ / sizeof(entry)); } + std::uint16_t entries_per_internal_node() const { return std::uint16_t(block_size_ / sizeof(internal_entry)); } std::uint64_t calculate_node_size(node_position input) const { @@ -123,39 +157,39 @@ namespace savvy return ret; } - tree_position calculate_parent_position(node_position input) const + tree_position calculate_parent_position(const node_position& input) const { return tree_position(input.level - 1, input.node_offset / this->entries_per_internal_node(), input.node_offset % this->entries_per_internal_node()); } - node_position calculate_child_position(tree_position input) const + node_position calculate_child_position(const tree_position& input) const { return node_position(input.level + 1, this->entries_per_internal_node() * input.node_offset + input.entry_offset); } - std::streampos calculate_file_position(node_position input) const + std::streampos calculate_file_position(const node_position& input) const { - std::uint64_t ret = root_block_offset_ * block_size_; + std::streampos ret = root_block_end_pos_ - std::streampos(block_size_); if (input.level > 0) { - ret += block_size_; // for root node - for (auto it = entry_counts_per_level_.begin(); it != entry_counts_per_level_.end(); ++it) { if (1 + std::distance(entry_counts_per_level_.begin(), it) < input.level) { - ret += *it * block_size_; + ret -= (*it * block_size_); } else { - ret += input.node_offset * block_size_; + ret -= (*it * block_size_ - input.node_offset * block_size_); break; } } } - return std::streampos(ret); + assert(ret >= std::streampos(block_offset_)); + + return ret; } @@ -164,41 +198,13 @@ namespace savvy return entry_counts_per_level_.size(); } - const std::string& name() const { return name_; } - protected: - std::string name_; - std::uint64_t root_block_offset_; + private: + std::streampos root_block_end_pos_; + std::uint64_t block_offset_; std::uint32_t block_size_; std::vector entry_counts_per_level_; std::uint64_t entry_count_; - - void init(std::uint8_t block_size_in_kib, std::uint64_t block_offset, const std::string& name) - { - //const std::uint16_t header_data_size = 7 + 2 + 8; - block_size_ = 1024u * (std::uint32_t(block_size_in_kib) + 1); - root_block_offset_ = block_offset; //ceil_divide(header_data_size, block_size_); - name_ = name; - - entry_counts_per_level_.clear(); - this->entry_counts_per_level_.insert(this->entry_counts_per_level_.begin(), entry_count_); - for (std::uint64_t nodes_at_current_level = ceil_divide(entry_count_, (std::uint64_t) this->entries_per_leaf_node()); - nodes_at_current_level > 1; - nodes_at_current_level = ceil_divide(nodes_at_current_level, (std::uint64_t) this->entries_per_internal_node())) - { - this->entry_counts_per_level_.insert(this->entry_counts_per_level_.begin(), nodes_at_current_level); - } - } - - static std::uint64_t ceil_divide(std::uint64_t x, std::uint64_t y) - { - return (x + y - 1) / y; - } - - static std::uint16_t ceil_divide(std::uint16_t x, std::uint16_t y) - { - return (x + y - std::uint16_t(1)) / y; - } }; class tree_reader : public tree_base @@ -231,11 +237,11 @@ namespace savvy ifs_->seekg(reader_->calculate_file_position(position_)); if (position_.level == leaf_level) - ifs_->read((char*) leaf_node_.data(), reader_->block_size_); + ifs_->read((char*) leaf_node_.data(), reader_->bucket_size()); else { traversal_chain_.emplace(reader_->entries_per_internal_node()); - ifs_->read((char*) (traversal_chain_.top().data()), reader_->block_size_); + ifs_->read((char*) (traversal_chain_.top().data()), reader_->bucket_size()); } traverse_right(); @@ -305,11 +311,11 @@ namespace savvy ifs_->seekg(reader_->calculate_file_position(position_)); if (position_.level == leaf_level) - ifs_->read((char*) leaf_node_.data(), reader_->block_size_); + ifs_->read((char*) leaf_node_.data(), reader_->bucket_size()); else { traversal_chain_.emplace(reader_->entries_per_internal_node()); - ifs_->read((char*) (traversal_chain_.top().data()), reader_->block_size_); + ifs_->read((char*) (traversal_chain_.top().data()), reader_->bucket_size()); } } } @@ -363,7 +369,9 @@ namespace savvy }; tree_reader(std::ifstream& file, std::uint8_t block_size_in_kib, std::uint64_t block_offset, const std::string& name, std::uint64_t entry_count) : - ifs_(file) + tree_base(block_size_in_kib, block_offset, entry_count), + ifs_(file), + name_(name) { // std::string version(7, '\0'); // ifs_.read(&version[0], version.size()); @@ -372,9 +380,6 @@ namespace savvy // block_size_ = static_cast(std::pow(8.0, (0x03 & block_size_exponent) + 2)); // ifs_.read((char*)(&entry_count_), 8); // entry_count_ = be64toh(entry_count_); - entry_count_ = entry_count; - - tree_base::init(block_size_in_kib, block_offset, name); } bool good() const { return ifs_.good(); } @@ -389,8 +394,11 @@ namespace savvy query ret(*this, ifs_, beg, end); return ret; } + + const std::string& name() const { return name_; } private: std::ifstream& ifs_; + std::string name_; }; enum class sort_type : std::uint8_t @@ -410,59 +418,59 @@ namespace savvy file_path_(file_path), input_file_(file_path, std::ios::binary) { - std::string version(7, '\0'); - input_file_.read(&version[0], version.size()); - std::string uuid(16, '\0'); - input_file_.read(&uuid[0], uuid.size()); + std::array footer; - std::uint8_t sort_byte; - input_file_.read((char*)(&sort_byte), 1); + input_file_.seekg(-(footer.size()), std::ios::end); + input_file_.read(footer.data(), footer.size()); + assert(input_file_.good()); - switch (sort_byte) - { - case (std::uint8_t)sort_type::left_point: sort_ = sort_type::left_point; break; - case (std::uint8_t)sort_type::right_point: sort_ = sort_type::right_point; break; - default: sort_ = sort_type::midpoint; - } + std::size_t bytes_parsed = 0; + std::string version(footer.begin() + (footer.size() - 7), footer.begin() + footer.size()); + bytes_parsed += version.size(); + + std::string uuid(footer.begin() + (footer.size() - bytes_parsed - 16), footer.begin() + (footer.size() - bytes_parsed)); + bytes_parsed += uuid.size(); + + std::uint16_t tree_details_size_be; + std::memcpy((char*)(&tree_details_size_be), footer.data() + (footer.size() - bytes_parsed - sizeof(tree_details_size_be)), sizeof(tree_details_size_be)); + bytes_parsed += sizeof(tree_details_size_be); + + std::uint16_t tree_details_size = be16toh(tree_details_size_be); std::uint8_t block_size_byte; - input_file_.read((char*)(&block_size_byte), 1); + std::memcpy((char*)(&block_size_byte), footer.data() + (footer.size() - bytes_parsed - sizeof(block_size_byte)), sizeof(block_size_byte)); + bytes_parsed += sizeof(block_size_byte); const std::uint32_t block_size = 1024u * (std::uint32_t(block_size_byte) + 1); - std::uint64_t header_size = 7 + 16 + 2; - struct tree_details { std::string name; std::uint64_t entry_count = 0; }; + input_file_.seekg(-(std::int64_t(footer.size()) + tree_details_size), std::ios::end); std::vector tree_details_array; - std::uint8_t name_size; - do + + int tree_details_bytes_left = tree_details_size; + while (tree_details_bytes_left > 0 && input_file_.good()) { - input_file_.read((char*)(&name_size), 1); + tree_details details; + std::getline(input_file_, details.name, '\0'); - if (name_size) - { - tree_details details; - details.name.resize(name_size); - input_file_.read(&details.name[0], name_size); + std::uint64_t entry_count_be = 0; + input_file_.read((char*)(&entry_count_be), 8); + details.entry_count = be64toh(entry_count_be); - std::uint64_t entry_count_be = 0; - input_file_.read((char*)(&entry_count_be), 8); - details.entry_count = be64toh(entry_count_be); + tree_details_bytes_left -= (details.name.size() + 1 + 8); - tree_details_array.emplace_back(std::move(details)); - header_size += (name_size + 8); - } - header_size += 1; + tree_details_array.emplace_back(std::move(details)); } - while (name_size > 0); - std::uint64_t block_count = detail::ceil_divide(header_size, block_size); + assert(tree_details_bytes_left == 0); + + std::uint64_t block_count = 0; trees_.reserve(tree_details_array.size()); for (auto it = tree_details_array.begin(); it != tree_details_array.end(); ++it) @@ -515,8 +523,6 @@ namespace savvy std::string file_path_; std::ifstream input_file_; std::vector trees_; - std::size_t tree_index_; - sort_type sort_; }; class reader::query @@ -559,16 +565,16 @@ namespace savvy public: typedef iterator self_type; typedef std::ptrdiff_t difference_type; - typedef tree_reader::entry value_type; + typedef entry value_type; typedef const value_type& reference; typedef const value_type* pointer; typedef std::bidirectional_iterator_tag iterator_category; iterator(std::istream& ifs, std::vector::iterator tree_query_it, tree_reader::query::iterator tree_query_beg, tree_reader::query::iterator tree_query_end) : - ifs_(&ifs), tree_it_(tree_query_it), tree_query_it_(tree_query_beg), - tree_query_end_(tree_query_end) + tree_query_end_(tree_query_end), + ifs_(&ifs) { } @@ -605,170 +611,318 @@ namespace savvy std::istream* ifs_; }; - class writer : public tree_base + class writer { public: - //writer(const std::string& file_path, const std::string& chrom, EntryIter beg, EntryIter end) : - writer(const std::string& file_path, std::map>& chrom_data, std::uint8_t block_size_in_kib) : + writer(const std::string& file_path, std::uint8_t block_size_in_kib = 4 - 1) : + file_path_(file_path), ofs_(file_path, std::ios::binary) { - const std::uint32_t block_size = 1024u * (std::uint32_t(block_size_in_kib) + 1); - std::uint64_t header_size = 7 + 16 + 2; - for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) - { - auto beg = it->second.begin(); - auto end = it->second.end(); - std::sort(beg, end, [](const tree_reader::entry& a, const tree_reader::entry& b) - { - double mid_a = static_cast(a.region_start()) + (static_cast(a.region_length()) / 2.0); - double mid_b = static_cast(b.region_start()) + (static_cast(b.region_length()) / 2.0); + this->block_size_ = 1024u * (std::uint32_t(block_size_in_kib) + 1); - return mid_a < mid_b; - }); + current_leaf_node_.reserve(detail::entries_per_leaf_node(block_size_)); + } - header_size += (1 + it->first.size() + 8); + ~writer() + { + if (chromosomes_.size()) + { + this->write_internal_nodes(); } - header_size += 1; - std::string header_block(detail::ceil_divide(header_size, std::uint64_t(block_size)) * block_size, '\0'); + std::uint16_t index_size = 0; + for (auto it = chromosomes_.begin(); it != chromosomes_.end(); ++it) + { + ofs_.write(it->first.c_str(), it->first.size() + 1); - std::size_t cur = 0; - std::memcpy(&header_block[cur], "s1r\x00\x01\x00\x00", 7); + std::uint64_t entry_count_be = htobe64(it->second); + ofs_.write((char *) (&entry_count_be), 8); - cur += 7; + index_size += (1 + it->first.size() + 8); + } - std::memset(&header_block[cur], '\0', 16); // uuid - cur += 16; + std::string footer_block(26, '\0'); + std::size_t cur = 0; - std::uint8_t sort = 0; - std::memcpy(&header_block[cur], (char*)(&sort), 1); + std::uint8_t block_size_in_kib = std::uint8_t(block_size_ / 1024 - 1); + std::memcpy(&footer_block[cur], (char *) (&block_size_in_kib), 1); cur += 1; - std::memcpy(&header_block[cur], (char*)(&block_size_in_kib), 1); - cur += 1; + std::uint16_t index_size_be = htobe16(index_size); + std::memcpy(&footer_block[cur], (char *) (&index_size_be), 2); + cur += 2; - for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) - { - std::uint8_t str_sz = it->first.size(); - std::memcpy(&header_block[cur], (char*)(&str_sz), 1); - cur += 1; + std::memset(&footer_block[cur], '\0', 16); // uuid + cur += 16; - std::memcpy(&header_block[cur], it->first.data(), str_sz); - cur += str_sz; + std::memcpy(&footer_block[cur], "s1r\x00\x01\x00\x00", 7); + ofs_.write(footer_block.data(), footer_block.size()); + } - std::uint64_t entry_count_be = htobe64((std::uint64_t)std::distance(it->second.begin(), it->second.end())); - std::memcpy(&header_block[cur], (char*)(&entry_count_be), 8); - cur += 8; - } - std::memset(&header_block[cur], '\0', 1); + writer& write(const std::string& chrom, const entry& e) + { + if (chromosomes_.empty()) + chromosomes_.emplace_back(chrom, 0); - ofs_.write(header_block.data(), header_block.size()); + if (chrom != chromosomes_.back().first) + { + this->write_internal_nodes(); + this->chromosomes_.emplace_back(chrom, 0); + } - std::uint64_t block_offset = header_block.size() / std::uint64_t(1024u * (std::uint32_t(block_size_in_kib) + 1)); - header_block.clear(); - header_block.shrink_to_fit(); + current_leaf_node_.emplace_back(e); + if (current_leaf_node_.size() == detail::entries_per_leaf_node(block_size_)) + { + ofs_.write((char *)(current_leaf_node_.data()), block_size_); + current_leaf_node_.resize(0); + } + ++(chromosomes_.back().second); - for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) + return *this; + } + + void write_internal_nodes() + { + if (this->current_leaf_node_.size()) { - auto beg = it->second.begin(); - auto end = it->second.end(); + this->current_leaf_node_.resize(detail::entries_per_leaf_node(block_size_)); + ofs_.write((char *)(current_leaf_node_.data()), block_size_); - entry_count_ = (std::uint64_t) std::distance(beg, end); + current_leaf_node_.resize(0); + } - tree_base::init(block_size_in_kib, block_offset, it->first); + ofs_.flush(); - std::vector, tree_position>> current_nodes_at_each_internal_level; - current_nodes_at_each_internal_level.reserve(this->tree_height() - 1); - for (std::size_t i = 0; i < this->tree_height() - 1; ++i) - current_nodes_at_each_internal_level.emplace_back(std::make_pair(std::vector(this->entries_per_internal_node()), tree_position(i, 0, 0))); + std::uint64_t num_leaf_nodes = detail::ceil_divide(this->chromosomes_.back().second, std::uint64_t(detail::entries_per_leaf_node(block_size_))); + tree_base tree(std::uint8_t(block_size_ / 1024 - 1), std::uint64_t(ofs_.tellp()) - block_size_ * num_leaf_nodes, this->chromosomes_.back().second); - std::vector current_leaf_node(this->entries_per_leaf_node()); - tree_position current_leaf_position(this->tree_height() - 1, 0, 0); + std::vector, tree_base::tree_position>> current_nodes_at_each_internal_level; + current_nodes_at_each_internal_level.reserve(tree.tree_height() - 1); + for (std::size_t i = 0; i < tree.tree_height() - 1; ++i) + current_nodes_at_each_internal_level.emplace_back(std::make_pair(std::vector(tree.entries_per_internal_node()), tree_base::tree_position(i, 0, 0))); - std::size_t node_counter = 0; - for (auto entry_it = beg; entry_it != end; ++entry_it) - { - const bool last_entry = entry_it + 1 == end; - const tree_reader::entry& e = *entry_it; + std::vector current_leaf_node(detail::entries_per_leaf_node(block_size_)); + tree_base::tree_position current_leaf_position(tree.tree_height() - 1, 0, 0); - current_leaf_node[current_leaf_position.entry_offset] = e; + std::ifstream ifs(file_path_, std::ios::binary); - if (current_leaf_position.entry_offset + 1 == current_leaf_node.size() || last_entry) - { - ofs_.seekp(this->calculate_file_position(current_leaf_position)); - ofs_.write((char*)(current_leaf_node.data()), block_size_); - ++node_counter; - std::uint64_t node_range_min = (std::uint64_t)-1; - std::uint64_t node_range_max = 0; - for (std::size_t i = 0; i <= current_leaf_position.entry_offset; ++i) - { - node_range_min = std::min(node_range_min, current_leaf_node[i].region_start()); - node_range_max = std::max(node_range_max, current_leaf_node[i].region_end()); - } + ifs.seekg(ofs_.tellp() - std::streamoff(block_size_ * num_leaf_nodes)); - for (auto rit = current_nodes_at_each_internal_level.rbegin(); rit != current_nodes_at_each_internal_level.rend(); ++rit) - { - rit->first[rit->second.entry_offset] = internal_entry(node_range_min, node_range_max); + for (std::size_t i = 0; i < num_leaf_nodes && ifs.good(); ++i) + { + bool last_leaf_node = (i + 1) == num_leaf_nodes; + ifs.read((char*)current_leaf_node.data(), block_size_); + std::uint32_t node_range_min = (std::uint32_t)-1; + std::uint32_t node_range_max = 0; + const std::size_t current_leaf_node_end = last_leaf_node + ? detail::entries_per_leaf_node(block_size_) - (this->chromosomes_.back().second % detail::entries_per_leaf_node(block_size_)) + : detail::entries_per_leaf_node(block_size_); + for (std::size_t i = 0; i < current_leaf_node_end; ++i) + { + node_range_min = std::min(node_range_min, current_leaf_node[i].region_start()); + node_range_max = std::max(node_range_max, current_leaf_node[i].region_end()); + } - if (rit->second.entry_offset + 1 == rit->first.size() || last_entry) - { - ofs_.seekp(this->calculate_file_position(rit->second)); - ofs_.write((char*)(rit->first.data()), block_size_); - ++node_counter; + for (auto rit = current_nodes_at_each_internal_level.rbegin(); rit != current_nodes_at_each_internal_level.rend(); ++rit) + { + rit->first[rit->second.entry_offset] = internal_entry(node_range_min, node_range_max); - node_range_min = (std::uint64_t)-1; - node_range_max = 0; - for (std::size_t i = 0; i <= rit->second.entry_offset; ++i) - { - node_range_min = std::min(node_range_min, rit->first[i].region_start()); - node_range_max = std::max(node_range_max, rit->first[i].region_end()); - } + if (rit->second.entry_offset + 1 == rit->first.size() || last_leaf_node) + { + ofs_.seekp(tree.calculate_file_position(rit->second)); + ofs_.write((char*)(rit->first.data()), block_size_); - rit->first = std::vector(this->entries_per_internal_node()); - ++(rit->second.node_offset); - rit->second.entry_offset = 0; - } - else - { - ++(rit->second.entry_offset); - break; - } + node_range_min = (std::uint32_t)-1; + node_range_max = 0; + for (std::size_t i = 0; i <= rit->second.entry_offset; ++i) + { + node_range_min = std::min(node_range_min, rit->first[i].region_start()); + node_range_max = std::max(node_range_max, rit->first[i].region_end()); } - current_leaf_node = std::vector(this->entries_per_leaf_node()); - ++(current_leaf_position.node_offset); - current_leaf_position.entry_offset = 0; + rit->first = std::vector(tree.entries_per_internal_node()); + ++(rit->second.node_offset); + rit->second.entry_offset = 0; } else { - ++(current_leaf_position.entry_offset); + ++(rit->second.entry_offset); + break; } } - block_offset += node_counter; } + + if (!ifs.good()) + ofs_.setstate(std::ios::badbit); } + + + + + +// writer(const std::string& file_path, std::map>& chrom_data, std::uint8_t block_size_in_kib) : +// ofs_(file_path, std::ios::binary) +// { +// const std::uint32_t block_size = 1024u * (std::uint32_t(block_size_in_kib) + 1); +// //std::uint64_t header_size = 7 + 16 + 2; +// +// for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) +// { +// auto beg = it->second.begin(); +// auto end = it->second.end(); +// std::sort(beg, end, [](const tree_reader::entry& a, const tree_reader::entry& b) +// { +// double mid_a = static_cast(a.region_start()) + (static_cast(a.region_length()) / 2.0); +// double mid_b = static_cast(b.region_start()) + (static_cast(b.region_length()) / 2.0); +// +// return mid_a < mid_b; +// }); +// } +// +// +// +// //std::uint64_t block_offset = header_block.size() / std::uint64_t(1024u * (std::uint32_t(block_size_in_kib) + 1)); +// std::size_t block_offset = 0; +// for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) +// { +// auto beg = it->second.begin(); +// auto end = it->second.end(); +// +// entry_count_ = (std::uint64_t) std::distance(beg, end); +// +// tree_base::init(block_size_in_kib, block_offset, it->first); +// +// std::vector, tree_position>> current_nodes_at_each_internal_level; +// current_nodes_at_each_internal_level.reserve(this->tree_height() - 1); +// for (std::size_t i = 0; i < this->tree_height() - 1; ++i) +// current_nodes_at_each_internal_level.emplace_back(std::make_pair(std::vector(this->entries_per_internal_node()), tree_position(i, 0, 0))); +// +// std::vector current_leaf_node(this->entries_per_leaf_node()); +// tree_position current_leaf_position(this->tree_height() - 1, 0, 0); +// +// std::size_t node_counter = 0; +// for (auto entry_it = beg; entry_it != end; ++entry_it) +// { +// const bool last_entry = entry_it + 1 == end; +// const tree_reader::entry& e = *entry_it; +// +// current_leaf_node[current_leaf_position.entry_offset] = e; +// +// if (current_leaf_position.entry_offset + 1 == current_leaf_node.size() || last_entry) +// { +// ofs_.seekp(this->calculate_file_position(current_leaf_position)); +// ofs_.write((char*)(current_leaf_node.data()), block_size_); +// ++node_counter; +// +// std::uint32_t node_range_min = (std::uint32_t)-1; +// std::uint32_t node_range_max = 0; +// for (std::size_t i = 0; i <= current_leaf_position.entry_offset; ++i) +// { +// node_range_min = std::min(node_range_min, current_leaf_node[i].region_start()); +// node_range_max = std::max(node_range_max, current_leaf_node[i].region_end()); +// } +// +// for (auto rit = current_nodes_at_each_internal_level.rbegin(); rit != current_nodes_at_each_internal_level.rend(); ++rit) +// { +// rit->first[rit->second.entry_offset] = internal_entry(node_range_min, node_range_max); +// +// +// +// if (rit->second.entry_offset + 1 == rit->first.size() || last_entry) +// { +// ofs_.seekp(this->calculate_file_position(rit->second)); +// ofs_.write((char*)(rit->first.data()), block_size_); +// ++node_counter; +// +// node_range_min = (std::uint32_t)-1; +// node_range_max = 0; +// for (std::size_t i = 0; i <= rit->second.entry_offset; ++i) +// { +// node_range_min = std::min(node_range_min, rit->first[i].region_start()); +// node_range_max = std::max(node_range_max, rit->first[i].region_end()); +// } +// +// rit->first = std::vector(this->entries_per_internal_node()); +// ++(rit->second.node_offset); +// rit->second.entry_offset = 0; +// } +// else +// { +// ++(rit->second.entry_offset); +// break; +// } +// } +// +// current_leaf_node = std::vector(this->entries_per_leaf_node()); +// ++(current_leaf_position.node_offset); +// current_leaf_position.entry_offset = 0; +// } +// else +// { +// ++(current_leaf_position.entry_offset); +// } +// } +// +// block_offset += node_counter; +// } +// +// std::uint16_t index_size = 0; +// for (auto it = chrom_data.begin(); it != chrom_data.end(); ++it) +// { +// ofs_.write(it->first.c_str(), it->first.size() + 1); +// +// std::uint64_t entry_count_be = htobe64((std::uint64_t)std::distance(it->second.begin(), it->second.end())); +// ofs_.write((char*)(&entry_count_be), 8); +// +// index_size += (1 + it->first.size() + 8); +// } +// +// std::string footer_block(26, '\0'); +// std::size_t cur = 0; +// +// std::memcpy(&footer_block[cur], (char*)(&block_size_in_kib), 1); +// cur += 1; +// +// std::uint16_t index_size_be = htobe16(index_size); +// std::memcpy(&footer_block[cur], (char*)(&index_size_be), 2); +// cur += 2; +// +// std::memset(&footer_block[cur], '\0', 16); // uuid +// cur += 16; +// +// std::memcpy(&footer_block[cur], "s1r\x00\x01\x00\x00", 7); +// ofs_.write(footer_block.data(), footer_block.size()); +// } + + explicit operator bool() const { return ofs_.good(); } + bool good() { return ofs_.good(); } + bool flush() { return ofs_.flush().good(); } private: + std::string file_path_; std::ofstream ofs_; + std::uint32_t block_size_; + std::vector current_leaf_node_; + std::vector> chromosomes_; }; - inline bool create_file(const std::string& file_path, std::map>& entries, std::uint8_t block_size_in_kib) - { - writer w(file_path, entries, block_size_in_kib); - return w.flush(); - } +// inline bool create_file(const std::string& file_path, std::map>& entries, std::uint8_t block_size_in_kib) +// { +// writer w(file_path, block_size_in_kib); +// return w.flush(); +// } inline reader::query reader::create_query(region reg) @@ -795,4 +949,4 @@ namespace savvy } } -#endif //LIBSAVVY_CMF_INDEX_READER_HPP \ No newline at end of file +#endif //LIBSAVVY_S1R_INDEX_READER_HPP \ No newline at end of file diff --git a/include/savvy/sav_reader.hpp b/include/savvy/sav_reader.hpp index 244bb0e..e85253f 100644 --- a/include/savvy/sav_reader.hpp +++ b/include/savvy/sav_reader.hpp @@ -1,5 +1,11 @@ -#ifndef LIBSAVVY_CMF_READER_HPP -#define LIBSAVVY_CMF_READER_HPP +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#ifndef LIBSAVVY_SAV_READER_HPP +#define LIBSAVVY_SAV_READER_HPP #include "allele_status.hpp" #include "varint.hpp" @@ -9,6 +15,7 @@ #include "variant_iterator.hpp" #include "utility.hpp" #include "data_format.hpp" +#include "utility.hpp" #include @@ -23,6 +30,7 @@ #include #include #include +#include namespace savvy { @@ -37,6 +45,16 @@ namespace savvy template static std::tuple decode(std::istreambuf_iterator& in_it, const std::istreambuf_iterator& end_it, const T& missing_value); }; + + template + struct allele_encoder + { + static const std::uint8_t multiplier = std::uint8_t(~(std::uint8_t(0xFF) << BitWidth)) + std::uint8_t(1); + template + static void encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it); + template + static std::int8_t encode(const T& allele); + }; } // namespace detail @@ -55,16 +73,15 @@ namespace savvy // } //################################################################// - template class reader_base { public: - template - reader_base(const std::string& file_path, T... data_formats); -#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 + reader_base(const std::string& file_path); + reader_base(const std::string& file_path, savvy::fmt data_format); + reader_base(reader_base&& source); reader_base& operator=(reader_base&& source); -#endif + //reader(const reader&) = delete; //reader& operator=(const reader&) = delete; virtual ~reader_base() {} @@ -78,59 +95,40 @@ namespace savvy // return good(); // } - explicit operator bool() const { return input_stream_.good(); } - bool good() const { return input_stream_.good(); } - bool fail() const { return input_stream_.fail(); } - bool bad() const { return input_stream_.bad(); } - bool eof() const { return input_stream_.eof(); } - std::uint64_t sample_count() const { return this->sample_ids_.size(); } - std::vector::const_iterator samples_begin() const { return sample_ids_.begin(); } - std::vector::const_iterator samples_end() const { return sample_ids_.end(); } + explicit operator bool() const { return input_stream_->good(); } + bool good() const { return input_stream_->good(); } + bool fail() const { return input_stream_->fail(); } + bool bad() const { return input_stream_->bad(); } + bool eof() const { return input_stream_->eof(); } + const std::vector& samples() const { return sample_ids_; } // std::vector::const_iterator prop_fields_begin() const { return metadata_fields_.begin(); } // std::vector::const_iterator prop_fields_end() const { return metadata_fields_.end(); } - std::vector prop_fields() const { return std::vector(metadata_fields_); } - std::vector> headers() const { return headers_; } + const std::vector& info_fields() const { return metadata_fields_; } + const std::vector>& headers() const { return headers_; } + savvy::fmt data_format() const { return file_data_format_; } + + /** + * + * @param subset IDs to include if they exist in file. + * @return intersect of subset and samples IDs in file. + */ + std::vector subset_samples(const std::set& subset); const std::string& file_path() const { return file_path_; } - std::streampos tellg() { return this->input_stream_.tellg(); } + std::streampos tellg() { return this->input_stream_->tellg(); } protected: - template - void clear_destinations(T& destination) - { - destination.resize(0); - } - - template - void clear_destinations(T& destination, T2&... other_destinations) - { - clear_destinations(destination); - clear_destinations(other_destinations...); - } - - void init_requested_formats(fmt f) - { - requested_data_formats_[requested_data_formats_.size() - 1] = f; - } - - template - void init_requested_formats(fmt f, T... args) - { - requested_data_formats_[requested_data_formats_.size() - (sizeof...(T) + 1)] = f; - init_requested_formats(args...); - } - void read_variant_details(site_info& annotations) { if (good()) { - std::istreambuf_iterator in_it(input_stream_); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t sz; if (varint_decode(in_it, end_it, sz) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { @@ -138,19 +136,19 @@ namespace savvy std::string chrom; chrom.resize(sz); if (sz) - input_stream_.read(&chrom[0], sz); + input_stream_->read(&chrom[0], sz); std::uint64_t locus; if (varint_decode(in_it, end_it, locus) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { ++in_it; if (varint_decode(in_it, end_it, sz) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { @@ -158,11 +156,11 @@ namespace savvy std::string ref; ref.resize(sz); if (sz) - input_stream_.read(&ref[0], sz); + input_stream_->read(&ref[0], sz); if (varint_decode(in_it, end_it, sz) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { @@ -170,7 +168,7 @@ namespace savvy std::string alt; alt.resize(sz); if (sz) - input_stream_.read(&alt[0], sz); + input_stream_->read(&alt[0], sz); std::unordered_map props; props.reserve(this->metadata_fields_.size()); @@ -179,7 +177,7 @@ namespace savvy { if (varint_decode(in_it, end_it, sz) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); break; } else @@ -188,7 +186,7 @@ namespace savvy if (sz) { prop_val.resize(sz); - input_stream_.read(&prop_val[0], sz); + input_stream_->read(&prop_val[0], sz); props[key] = prop_val; } } @@ -207,13 +205,13 @@ namespace savvy { if (good()) { - std::istreambuf_iterator in_it(input_stream_); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t ploidy_level; if (varint_decode(in_it, end_it, ploidy_level) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { @@ -228,7 +226,7 @@ namespace savvy in_it = prefixed_varint::decode(in_it, end_it, allele, offset); } - input_stream_.get(); + input_stream_->get(); } } } @@ -241,289 +239,464 @@ namespace savvy this->discard_genotypes_impl<7>(); } - template + template void read_genotypes_al(T& destination) { - if (file_data_format_ != fmt::allele) - input_stream_.setstate(std::ios::badbit); - if (good()) { - const typename T::value_type alt_value = typename T::value_type(1); - std::istreambuf_iterator in_it(input_stream_); + const auto missing_value = std::numeric_limits::quiet_NaN(); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t ploidy_level; if (varint_decode(in_it, end_it, ploidy_level) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { - destination.resize(sample_count() * ploidy_level); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + + if (subset_map_.size()) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset] = allele; //(allele ? missing_value : alt_value); + destination.resize(subset_size_ * ploidy_level); + + if (BitWidth == 1) + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy_level + (total_offset % ploidy_level)] = allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy_level + (total_offset % ploidy_level)] = std::round(allele); //(allele ? missing_value : alt_value); + } + } + } + else + { + destination.resize(samples().size() * ploidy_level); + + if (BitWidth == 1) + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset] = allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset] = std::round(allele); //(allele ? missing_value : alt_value); + } + } } - input_stream_.get(); + input_stream_->get(); } } } - template + template void read_genotypes_gt(T& destination) { - if (file_data_format_ != fmt::allele) - input_stream_.setstate(std::ios::failbit); - if (good()) { - const typename T::value_type alt_value{1}; - std::istreambuf_iterator in_it(input_stream_); + const auto missing_value = std::numeric_limits::quiet_NaN(); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t ploidy_level; if (varint_decode(in_it, end_it, ploidy_level) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { - destination.resize(sample_count()); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - std::uint64_t ploidy_counter = 0; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + + if (subset_map_.size()) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<1>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + destination.resize(subset_size_); + + if (BitWidth == 1) + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += std::round(allele); //(allele ? missing_value : alt_value); + } + } } + else + { + destination.resize(samples().size()); - input_stream_.get(); + if (BitWidth == 1) + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset / ploidy_level] += allele; //(allele ? missing_value : alt_value); + } + } + else + { + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset / ploidy_level] += std::round(allele); //(allele ? missing_value : alt_value); + } + } + } + + input_stream_->get(); } } } - template - void read_genotypes_gp(T& destination) - { - if (file_data_format_ != fmt::genotype_probability) - input_stream_.setstate(std::ios::failbit); +// template +// void read_genotypes_gp(T& destination) +// { +// if (file_data_format_ != fmt::genotype_probability) +// input_stream_->setstate(std::ios::failbit); +// +// if (good()) +// { +// const typename T::value_type alt_value = typename T::value_type(1); +// std::istreambuf_iterator in_it(input_stream_); +// std::istreambuf_iterator end_it; +// +// std::uint64_t ploidy_level; +// if (varint_decode(in_it, end_it, ploidy_level) == end_it) +// { +// this->input_stream_->setstate(std::ios::badbit); +// } +// else +// { +// std::size_t stride = ploidy_level + 1; +// destination.resize(sample_count() * stride); +// +// std::uint64_t sz; +// varint_decode(++in_it, end_it, sz); +// std::uint64_t total_offset = 0; +// //std::uint64_t next_ref_value_offset = 0; +// //std::uint64_t last_stride_offset = 0; +// +// for (std::size_t i = 0; i < sample_count(); ++i) +// { +// assert(i < destination.size()); +// destination[i * stride] = typename T::value_type(1); +// } +// +// +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// +// total_offset += offset; +// +// assert(total_offset < destination.size()); +// destination[total_offset] = allele; +// destination[(total_offset / stride) * stride] -= allele; +// } +// +// input_stream_->get(); +// } +// } +// } + template + void read_genotypes_hds(T& destination) + { if (good()) { - const typename T::value_type alt_value = typename T::value_type(1); - std::istreambuf_iterator in_it(input_stream_); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t ploidy_level; if (varint_decode(in_it, end_it, ploidy_level) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { - std::size_t stride = ploidy_level + 1; - destination.resize(sample_count() * stride); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - std::uint64_t next_ref_value_offset = 0; - std::uint64_t last_stride_offset = 0; - for (std::size_t i = 0; i < sample_count(); ++i) + if (subset_map_.size()) { - assert(i < destination.size()); - destination[i * stride] = typename T::value_type(1); - } + destination.resize(subset_size_ * ploidy_level); - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy_level + (total_offset % ploidy_level)] = allele; + } + } + else { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); + destination.resize(samples().size() * ploidy_level); + + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; + total_offset += offset; - assert(total_offset < destination.size()); - destination[total_offset] = allele; - destination[(total_offset / stride) * stride] -= allele; + assert(total_offset < destination.size()); + destination[total_offset] = allele; + } } - input_stream_.get(); + input_stream_->get(); } } } - template + template void read_genotypes_ds(T& destination) { - if (file_data_format_ != fmt::genotype_probability) - input_stream_.setstate(std::ios::failbit); - if (good()) { - const typename T::value_type alt_value{1}; - std::istreambuf_iterator in_it(input_stream_); + const typename T::value_type missing_value(std::numeric_limits::quiet_NaN()); + std::istreambuf_iterator in_it(*input_stream_); std::istreambuf_iterator end_it; std::uint64_t ploidy_level; if (varint_decode(in_it, end_it, ploidy_level) == end_it) { - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } else { - destination.resize(sample_count()); - std::uint64_t sz; varint_decode(++in_it, end_it, sz); std::uint64_t total_offset = 0; - std::uint64_t ploidy_counter = 0; - for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + + if (subset_map_.size()) { - typename T::value_type allele; - std::uint64_t offset; - std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); - total_offset += offset; - destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); + destination.resize(subset_size_); + + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + + const std::uint64_t sample_index = total_offset / ploidy_level; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index]] += allele; + } } + else + { + destination.resize(samples().size()); - input_stream_.get(); + for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) + { + typename T::value_type allele; + std::uint64_t offset; + std::tie(allele, offset) = detail::allele_decoder::decode(++in_it, end_it, missing_value); + total_offset += offset; + destination[total_offset / ploidy_level] += allele; + } + } +// destination.resize(sample_count()); +// +// std::uint64_t sz; +// varint_decode(++in_it, end_it, sz); +// std::uint64_t total_offset = 0; +// //std::uint64_t ploidy_counter = 0; +// +// if (file_data_format_ == fmt::genotype_probability) +// { +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// total_offset += offset; +// destination[total_offset / ploidy_level] += (allele * ((total_offset % ploidy_level) + 1)); +// } +// } +// else // fmt::haplotype_dosage +// { +// for (std::size_t i = 0; i < sz && in_it != end_it; ++i, ++total_offset) +// { +// typename T::value_type allele; +// std::uint64_t offset; +// std::tie(allele, offset) = detail::allele_decoder<7>::decode(++in_it, end_it, std::numeric_limits::quiet_NaN()); +// total_offset += offset; +// destination[total_offset / ploidy_level] += allele; +// } +// } + + input_stream_->get(); } } } template - void read_genotypes(std::size_t idx, T& destination) + void read_genotypes(T& destination) { - if (requested_data_formats_[idx] == file_data_format_) + destination.resize(0); + if (true) //requested_data_formats_[idx] == file_data_format_) { - if (requested_data_formats_[idx] == fmt::allele && file_data_format_ == fmt::allele) - read_genotypes_al(destination); - else if (requested_data_formats_[idx] == fmt::genotype && file_data_format_ == fmt::allele) - read_genotypes_gt(destination); - else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) - read_genotypes_gp(destination); - else if (requested_data_formats_[idx] == fmt::dosage && file_data_format_ == fmt::genotype_probability) - read_genotypes_ds(destination); + if (requested_data_format_ == fmt::allele) + file_data_format_ == fmt::allele ? read_genotypes_al<1>(destination) : read_genotypes_al<7>(destination); + else if (requested_data_format_== fmt::genotype) + file_data_format_ == fmt::allele ? read_genotypes_gt<1>(destination) : read_genotypes_gt<7>(destination); +// else if (requested_data_formats_[idx] == fmt::genotype_probability && file_data_format_ == fmt::genotype_probability) +// read_genotypes_gp(destination); + else if (requested_data_format_ == fmt::dosage) + file_data_format_ == fmt::allele ? read_genotypes_ds<1>(destination) : read_genotypes_ds<7>(destination); + else if (requested_data_format_ == fmt::haplotype_dosage) + file_data_format_ == fmt::allele ? read_genotypes_hds<1>(destination) : read_genotypes_hds<7>(destination); else - input_stream_.setstate(std::ios::failbit); + input_stream_->setstate(std::ios::failbit); } else { - if (file_data_format_ == fmt::allele) - discard_genotypes(); - else - discard_genotypes(); + discard_genotypes(); } } - - template - void read_genotypes(std::size_t idx, T& destination, T2&... other_destinations) - { - if (requested_data_formats_[idx] == file_data_format_) - { - read_genotypes(idx, destination); - } - else - { - read_genotypes(idx + 1, other_destinations...); - } - } - - + private: + void parse_header(); protected: std::vector sample_ids_; - fmt file_data_format_; - std::array requested_data_formats_; - shrinkwrap::zstd::istream input_stream_; - std::string file_path_; + std::vector subset_map_; std::vector> headers_; std::vector metadata_fields_; + std::string file_path_; + std::uint64_t subset_size_; + std::unique_ptr input_stream_; + fmt file_data_format_; + fmt requested_data_format_; }; //################################################################// //################################################################// - template - class reader : public reader_base + class reader : public reader_base { public: - using reader_base::reader_base; + using reader_base::reader_base; - template - reader& operator>>(std::tuple& destination) + template + reader& operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + return this->read(destination, destination.data()); } - template - reader& read(site_info& annotations, T&... destinations) + template + reader& read(site_info& annotations, T& destination) { - static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); - this->read_variant_details(annotations); - this->clear_destinations(destinations...); - this->read_genotypes(0, destinations...); + this->read_genotypes(destination); return *this; } }; - template - class indexed_reader : public reader_base + class indexed_reader : public reader_base { public: - template - indexed_reader(const std::string& file_path, const std::string& index_file_path, const region& reg, coord_bound bounding_type, T... data_formats) : - reader_base(file_path, data_formats...), + template + indexed_reader(const std::string& file_path, const std::string& index_file_path, const region& reg, coord_bound bounding_type, T data_format) : + reader_base(file_path, data_format), index_(index_file_path.size() ? index_file_path : file_path + ".s1r"), query_(index_.create_query(reg)), i_(query_.begin()), reg_(reg), + bounding_type_(bounding_type), current_offset_in_block_(0), - total_in_block_(0), - bounding_type_(bounding_type) + total_in_block_(0) { if (!index_.good()) - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } - template - indexed_reader(const std::string& file_path, const region& reg, T... data_formats) : - indexed_reader(file_path, std::string(""), reg, coord_bound::any, data_formats...) + indexed_reader(const std::string& file_path, const region& reg, savvy::fmt data_format) : + indexed_reader(file_path, std::string(""), reg, coord_bound::any, data_format) { } - template - indexed_reader(const std::string& file_path, const std::string& index_file_path, const region& reg, T... data_formats) : - indexed_reader(file_path, index_file_path, reg, coord_bound::any, data_formats...) + indexed_reader(const std::string& file_path, const std::string& index_file_path, const region& reg, savvy::fmt data_format) : + indexed_reader(file_path, index_file_path, reg, coord_bound::any, data_format) { } - template - indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, T... data_formats) : - indexed_reader(file_path, std::string(""), reg, bounding_type, data_formats...) + indexed_reader(const std::string& file_path, const region& reg, coord_bound bounding_type, savvy::fmt data_format) : + indexed_reader(file_path, std::string(""), reg, bounding_type, data_format) { } @@ -532,40 +705,31 @@ namespace savvy return index_.tree_names(); } - template - indexed_reader& operator>>(std::tuple& destination) + template + indexed_reader& operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + return this->read(destination, destination.data()); } - - template - indexed_reader& read(site_info& annotations, T&... destinations) + template + indexed_reader& read(site_info& annotations, T& destination) { - static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); - while (this->good()) { if (current_offset_in_block_ >= total_in_block_) { if (i_ == query_.end()) - this->input_stream_.setstate(std::ios::eofbit); + this->input_stream_->setstate(std::ios::eofbit); else { total_in_block_ = std::uint32_t(0x000000000000FFFF & i_->value()) + 1; current_offset_in_block_ = 0; - this->input_stream_.seekg(std::streampos((i_->value() >> 16) & 0x0000FFFFFFFFFFFF)); + this->input_stream_->seekg(std::streampos((i_->value() >> 16) & 0x0000FFFFFFFFFFFF)); ++i_; } } this->read_variant_details(annotations); - this->clear_destinations(destinations...); - this->read_genotypes(0, destinations...); + this->read_genotypes(destination); ++current_offset_in_block_; if (region_compare(bounding_type_, annotations, reg_)) { @@ -575,22 +739,20 @@ namespace savvy return *this; } - template - indexed_reader& read_if(Pred fn, site_info& annotations, T&... destinations) + template + indexed_reader& read_if(Pred fn, site_info& annotations, T& destination) { - static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); - while (this->good()) { if (current_offset_in_block_ >= total_in_block_) { if (i_ == query_.end()) - this->input_stream_.setstate(std::ios::eofbit); + this->input_stream_->setstate(std::ios::eofbit); else { total_in_block_ = std::uint32_t(0x000000000000FFFF & i_->value()) + 1; current_offset_in_block_ = 0; - this->input_stream_.seekg(std::streampos((i_->value() >> 16) & 0x0000FFFFFFFFFFFF)); + this->input_stream_->seekg(std::streampos((i_->value() >> 16) & 0x0000FFFFFFFFFFFF)); ++i_; } } @@ -600,8 +762,7 @@ namespace savvy bool predicate_passed = fn(annotations); if (region_compare(bounding_type_, annotations, reg_) && predicate_passed) { - this->clear_destinations(destinations...); - this->read_genotypes(0, destinations...); + this->read_genotypes(destination); break; } else @@ -621,22 +782,12 @@ namespace savvy current_offset_in_block_ = 0; total_in_block_ = 0; reg_ = reg; - this->input_stream_.clear(); + this->input_stream_->clear(); query_ = index_.create_query(reg); i_ = query_.begin(); if (!index_.good()) - this->input_stream_.setstate(std::ios::badbit); + this->input_stream_->setstate(std::ios::badbit); } - - private: -// bool update_file_position() -// { -// if (i_ == query_.end()) -// return false; -// input_stream_.seekg(std::streampos(i_->value())); -// ++i_; -// return true; -// } private: s1r::reader index_; s1r::reader::query query_; @@ -653,11 +804,10 @@ namespace savvy public: struct options { - fmt data_format; std::int8_t compression_level; std::uint16_t block_size; + std::string index_path; options() : - data_format(fmt::allele), compression_level(3), block_size(2048) { @@ -665,15 +815,25 @@ namespace savvy }; template - writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, options opts = options()) : + writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, fmt data_format) : + writer(file_path, options(), samples_beg, samples_end, headers_beg, headers_end, data_format) + { + } + + template + writer(const std::string& file_path, const options& opts, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, fmt data_format) : output_buf_(opts.compression_level > 0 ? std::unique_ptr(new shrinkwrap::zstd::obuf(file_path, opts.compression_level)) : std::unique_ptr(create_std_filebuf(file_path, std::ios::binary | std::ios::out))), //opts.compression == compression_type::zstd ? std::unique_ptr(new shrinkwrap::zstd::obuf(file_path)) : std::unique_ptr(new std::filebuf(file_path, std::ios::binary))), output_stream_(output_buf_.get()), file_path_(file_path), + index_file_(opts.index_path.size() ? ::savvy::detail::make_unique(opts.index_path) : nullptr), + current_block_min_(std::numeric_limits::max()), + current_block_max_(0), sample_size_(samples_end - samples_beg), allele_count_(0), record_count_(0), + record_count_in_block_(0), block_size_(opts.block_size), - data_format_(opts.data_format) + data_format_(data_format) { std::string version_string("sav\x00\x01\x00\x00", 7); output_stream_.write(version_string.data(), version_string.size()); @@ -683,14 +843,19 @@ namespace savvy std::ostreambuf_iterator out_it(output_stream_); - - bool format_header_added = false; headers_.resize(std::distance(headers_beg, headers_end)); auto copy_res = std::copy_if(headers_beg, headers_end, headers_.begin(), [](const std::pair& kvp) { return kvp.first != "FORMAT"; }); headers_.resize(std::distance(headers_.begin(), copy_res)); // TODO: Handle unsupported formats. - headers_.push_back(std::make_pair("FORMAT", data_format_ == fmt::genotype_probability ? "" : "")); + const char* fmt_str; + if (data_format_ == fmt::haplotype_dosage) + fmt_str = ""; +// else if (data_format_ == fmt::genotype_probability) +// fmt_str = ""; + else + fmt_str = ""; + headers_.push_back(std::make_pair("FORMAT", fmt_str)); varint_encode(headers_.size(), out_it); for (auto it = headers_.begin(); it != headers_.end(); ++it) @@ -731,10 +896,34 @@ namespace savvy } + ~writer() + { + // TODO: This is only a temp solution. + if (index_file_) + { + if (record_count_in_block_) + { + auto file_pos = std::uint64_t(output_stream_.tellp()); + if (record_count_in_block_ > 0x10000) // Max records per block: 64*1024 + { + assert(!"Too many records in zstd frame to be indexed!"); + } + + if (file_pos > 0x0000FFFFFFFFFFFF) // Max file size: 256 TiB + { + assert(!"File size to large to be indexed!"); + } + + s1r::entry e(current_block_min_, current_block_max_, (file_pos << 16) | std::uint16_t(record_count_in_block_ - 1)); + index_file_->write(current_chromosome_, e); + } + } + } + template - writer& operator<<(std::tuple>& m) + writer& operator<<(const savvy::variant& v) { - write(std::get<0>(m), std::get<1>(m)); + write(v, v.data()); return *this; } @@ -799,10 +988,10 @@ namespace savvy output_stream_.write((char*)tmp.data(), tmp.size() * 4); } #else - template - void write(const site_info& annotations, const std::vector& data) + template + void write(const site_info& annotations, const VecT& data) { - if (output_stream_.good()) + if (this->good()) { if (data.size() % sample_size_ != 0) { @@ -814,9 +1003,30 @@ namespace savvy //if (allele_count_ >= 0x100000 || (record_count_ % 0x10000) == 0 || annotations.chromosome() != current_chromosome_) if (block_size_ != 0 && ((record_count_ % block_size_) == 0 || annotations.chromosome() != current_chromosome_)) { + if (index_file_ && record_count_in_block_) + { + auto file_pos = std::uint64_t(output_stream_.tellp()); + if (record_count_in_block_ > 0x10000) // Max records per block: 64*1024 + { + assert(!"Too many records in zstd frame to be indexed!"); + output_stream_.setstate(std::ios::badbit); + } + + if (file_pos > 0x0000FFFFFFFFFFFF) // Max file size: 256 TiB + { + assert(!"File size to large to be indexed!"); + output_stream_.setstate(std::ios::badbit); + } + + s1r::entry e(current_block_min_, current_block_max_, (file_pos << 16) | std::uint16_t(record_count_in_block_ - 1)); + index_file_->write(current_chromosome_, e); + } output_stream_.flush(); allele_count_ = 0; current_chromosome_ = annotations.chromosome(); + record_count_in_block_ = 0; + current_block_min_ = std::numeric_limits::max(); + current_block_max_ = 0; } std::ostreambuf_iterator os_it(output_stream_.rdbuf()); @@ -824,7 +1034,7 @@ namespace savvy varint_encode(annotations.chromosome().size(), os_it); std::copy(annotations.chromosome().begin(), annotations.chromosome().end(), os_it); - varint_encode(annotations.locus(), os_it); + varint_encode(annotations.position(), os_it); varint_encode(annotations.ref().size(), os_it); if (annotations.ref().size()) @@ -844,28 +1054,35 @@ namespace savvy std::copy(value.begin(), value.end(), os_it); } - if (data_format_ == fmt::genotype_probability) + if (data_format_ == fmt::haplotype_dosage) { - write_probs(data); + write_hap_dosages(data); } +// else if (data_format_ == fmt::genotype_probability) +// { +// write_probs(data); +// } else { write_alleles(data); } + current_block_min_ = std::min(current_block_min_, std::uint32_t(annotations.position())); + current_block_max_ = std::max(current_block_max_, std::uint32_t(annotations.position() + std::max(annotations.ref().size(), annotations.alt().size())) - 1); + ++record_count_in_block_; ++record_count_; } } } #endif explicit operator bool() const { return good(); } - bool good() const { return output_stream_.good(); } + bool good() const { return output_stream_.good() && (!index_file_ || index_file_->good()); } bool fail() const { return output_stream_.fail(); } bool bad() const { return output_stream_.bad(); } bool eof() const { return output_stream_.eof(); } static bool create_index(const std::string& input_file_path, std::string output_file_path = ""); - private: + protected: template static std::size_t get_string_size(T str); @@ -876,15 +1093,43 @@ namespace savvy return ret; } - template - struct allele_encoder + template + static void serialize_alleles(const std::vector& m, OutIt os_it) { - static const std::uint8_t multiplier = std::uint8_t(~(std::uint8_t(0xFF) << BitWidth)) + std::uint8_t(1); - template - static void encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it); - template - static std::int8_t encode(const T& allele); - }; + std::uint64_t last_pos = 0; + const auto beg = m.begin(); + for (auto it = beg; it != m.end(); ++it) + { + std::int8_t signed_allele = detail::allele_encoder::encode(*it); + if (signed_allele >= 0) + { + std::uint64_t dist = static_cast(std::distance(beg, it)); + std::uint64_t offset = dist - last_pos; + last_pos = dist + 1; + prefixed_varint::encode((std::uint8_t)(signed_allele), offset, os_it); + } + } + + } + + template + static void serialize_alleles(const savvy::compressed_vector& m, OutIt os_it) + { + std::uint64_t last_pos = 0; + auto end = m.end(); + for (auto it = m.begin(); it != end; ++it) + { + std::int8_t signed_allele = detail::allele_encoder::encode(*it); + if (signed_allele >= 0) + { + std::uint64_t dist = it.offset(); + std::uint64_t offset = dist - last_pos; + last_pos = dist + 1; + prefixed_varint::encode((std::uint8_t)(signed_allele), offset, os_it); + } + } + + } template void write_alleles(const std::vector& m) @@ -901,223 +1146,141 @@ namespace savvy std::uint64_t non_zero_count = m.size() - static_cast(std::count(m.begin(), m.end(), ref_value)); allele_count_ += non_zero_count; varint_encode(non_zero_count, os_it); - std::uint64_t last_pos = 0; - auto beg = m.begin(); - for (auto it = beg; it != m.end(); ++it) - { - //std::int8_t signed_allele = std::round((std::isnan(*it) ? T::value_type(0.5) : *it) * type_multiplier) - T::value_type(1); - if (*it != ref_value) - { - std::uint64_t dist = static_cast(std::distance(beg, it)); - std::uint64_t offset = dist - last_pos; - last_pos = dist + 1; - allele_encoder<1>::encode(*it, offset, os_it); - } - } + + serialize_alleles<1>(m, os_it); } template - void write_probs(const std::vector& m) + void write_alleles(const savvy::compressed_vector& m) { - const T ref_value = T(); + std::ostreambuf_iterator os_it(output_stream_.rdbuf()); + + std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF); + + // TODO: check modulus and set error if needed. + varint_encode(ploidy, os_it); + + allele_count_ += m.non_zero_size(); + varint_encode(m.non_zero_size(), os_it); + serialize_alleles<1>(m, os_it); + } + + template + void write_hap_dosages(const std::vector& m) + { std::ostreambuf_iterator os_it(output_stream_.rdbuf()); - std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF) - 1; - std::uint32_t stride = ploidy + 1; + std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF); // TODO: check modulus and set error if needed. varint_encode(ploidy, os_it); - auto beg = m.begin(); std::uint64_t non_zero_count = 0; - std::size_t c = 0; - for (auto it = m.begin(); it != m.end(); ++it,++c) + for (auto it = m.begin(); it != m.end(); ++it) { - if (c % stride != 0) - { - if (allele_encoder<7>::encode(*it) >= 0) - ++non_zero_count; - } + if (detail::allele_encoder<7>::encode(*it) >= 0) + ++non_zero_count; } - allele_count_ += non_zero_count; varint_encode(non_zero_count, os_it); - std::uint64_t last_pos = 0; - c = 0; - for (auto it = beg; it != m.end(); ++it,++c) + + serialize_alleles<7>(m, os_it); + } + + template + void write_hap_dosages(const savvy::compressed_vector& m) + { + std::ostreambuf_iterator os_it(output_stream_.rdbuf()); + + std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF); + + // TODO: check modulus and set error if needed. + varint_encode(ploidy, os_it); + + std::uint64_t non_zero_count = 0; + for (auto it = m.begin(); it != m.end(); ++it) { - if (c % stride != 0) - { - //std::int8_t signed_allele = std::round((std::isnan(*it) ? T::value_type(0.5) : *it) * type_multiplier) - T::value_type(1); - std::int8_t signed_allele = allele_encoder<7>::encode(*it); - if (signed_allele >= 0) - { - std::uint64_t dist = static_cast(std::distance(beg, it)); - std::uint64_t offset = dist - last_pos; - last_pos = dist + 1; - prefixed_varint<7>::encode((std::uint8_t)(signed_allele), offset, os_it); - } - } + if (detail::allele_encoder<7>::encode(*it) >= 0) + ++non_zero_count; } + + allele_count_ += non_zero_count; + varint_encode(non_zero_count, os_it); + + serialize_alleles<7>(m, os_it); } + +// template +// void write_probs(const std::vector& m) +// { +// const T ref_value = T(); +// +// std::ostreambuf_iterator os_it(output_stream_.rdbuf()); +// +// std::uint32_t ploidy = std::uint32_t((m.size() / sample_size_) & 0xFFFFFFFF) - 1; +// std::uint32_t stride = ploidy + 1; +// +// // TODO: check modulus and set error if needed. +// varint_encode(ploidy, os_it); +// +// auto beg = m.begin(); +// std::uint64_t non_zero_count = 0; +// std::size_t c = 0; +// for (auto it = m.begin(); it != m.end(); ++it,++c) +// { +// if (c % stride != 0) +// { +// if (allele_encoder<7>::encode(*it) >= 0) +// ++non_zero_count; +// } +// } +// +// +// allele_count_ += non_zero_count; +// varint_encode(non_zero_count, os_it); +// std::uint64_t last_pos = 0; +// c = 0; +// for (auto it = beg; it != m.end(); ++it,++c) +// { +// if (c % stride != 0) +// { +// //std::int8_t signed_allele = std::round((std::isnan(*it) ? T::value_type(0.5) : *it) * type_multiplier) - T::value_type(1); +// std::int8_t signed_allele = allele_encoder<7>::encode(*it); +// if (signed_allele >= 0) +// { +// std::uint64_t dist = static_cast(std::distance(beg, it)); +// std::uint64_t offset = dist - last_pos; +// last_pos = dist + 1; +// prefixed_varint<7>::encode((std::uint8_t)(signed_allele), offset, os_it); +// } +// } +// } +// } private: static const std::array empty_string_array; static const std::array, 0> empty_string_pair_array; - private: + protected: std::unique_ptr output_buf_; std::ostream output_stream_; std::vector> headers_; std::vector property_fields_; std::string file_path_; + std::unique_ptr index_file_; std::string current_chromosome_; + std::uint32_t current_block_min_; + std::uint32_t current_block_max_; std::uint64_t sample_size_; std::uint32_t metadata_fields_cnt_; std::size_t allele_count_; std::size_t record_count_; + std::size_t record_count_in_block_; std::uint16_t block_size_; fmt data_format_; }; - - - - - template - template - reader_base::reader_base(const std::string& file_path, T... data_formats) : - input_stream_(file_path), - file_path_(file_path), - file_data_format_(fmt::allele) - { - static_assert(VecCnt == sizeof...(T), "Number of requested format fields do not match VecCnt template parameter"); - - init_requested_formats(data_formats...); - - std::string version_string(7, '\0'); - input_stream_.read(&version_string[0], version_string.size()); - - std::string uuid(16, '\0'); - input_stream_.read(&uuid[0], uuid.size()); - - - std::istreambuf_iterator in_it(input_stream_); - std::istreambuf_iterator end; - - std::uint64_t headers_size; - if (varint_decode(in_it, end, headers_size) != end) - { - ++in_it; - headers_.reserve(headers_size); - - while (headers_size && in_it != end) - { - std::uint64_t key_size; - if (varint_decode(in_it, end, key_size) != end) - { - ++in_it; - if (key_size) - { - std::string key; - key.resize(key_size); - input_stream_.read(&key[0], key_size); - - std::uint64_t val_size; - if (varint_decode(in_it, end, val_size) != end) - { - ++in_it; - if (key_size) - { - std::string val; - val.resize(val_size); - input_stream_.read(&val[0], val_size); - - if (key == "INFO") - { - std::string info_field = parse_header_id(val); - metadata_fields_.push_back(std::move(info_field)); - } - else if (key == "FORMAT") - { - std::string format_field = parse_header_id(val); - if (format_field == "GT") - file_data_format_ = fmt::allele; - else if (format_field == "GP") - file_data_format_ = fmt::genotype_probability; - } - headers_.emplace_back(std::move(key), std::move(val)); - } - } - - } - } - --headers_size; - } - - if (!headers_size) - { - std::uint64_t sample_size; - if (varint_decode(in_it, end, sample_size) != end) - { - ++in_it; - sample_ids_.reserve(sample_size); - - std::uint64_t id_sz; - while (sample_size && varint_decode(in_it, end, id_sz) != end) - { - ++in_it; - sample_ids_.emplace_back(); - if (id_sz) - { - sample_ids_.back().resize(id_sz); - input_stream_.read(&sample_ids_.back()[0], id_sz); - } - --sample_size; - } - - if (!sample_size) - return; - } - } - } - - input_stream_.peek(); - } - -#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 - template - reader_base::reader_base(reader_base&& source) : - sample_ids_(std::move(source.sample_ids_)), - //sbuf_(std::move(source.sbuf_)), - //input_stream_(&sbuf_), - input_stream_(std::move(source.input_stream_)), - file_path_(std::move(source.file_path_)), - metadata_fields_(std::move(source.metadata_fields_)), - file_data_format_(source.file_data_format_), - requested_data_formats_(source.requested_data_formats_) - { - } - - template - reader_base& reader_base::operator=(reader_base&& source) - { - if (&source != this) - { - sample_ids_ = std::move(source.sample_ids_); - //sbuf_ = std::move(source.sbuf_); - //input_stream_.rdbuf(&sbuf_); - input_stream_ = std::move(source.input_stream_); - file_path_ = std::move(source.file_path_); - metadata_fields_ = std::move(source.metadata_fields_); - file_data_format_ = source.file_data_format_; - requested_data_formats_ = source.requested_data_formats_; - } - return *this; - } -#endif - template <> template inline std::tuple detail::allele_decoder<0>::decode(std::istreambuf_iterator& in_it, const std::istreambuf_iterator& end_it, const T& missing_value) @@ -1149,61 +1312,61 @@ namespace savvy return ret; } - - template - std::size_t writer::get_string_size(T str) - { - return str.size(); - } - - template <> - inline std::size_t writer::get_string_size(const char* str) - { - return std::strlen(str); - } - template<> template - inline void writer::allele_encoder<0>::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) + inline void detail::allele_encoder<0>::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) { varint_encode(offset, os_it); } template<> template - inline void writer::allele_encoder<1>::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) + inline void detail::allele_encoder<1>::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) { prefixed_varint<1>::encode(std::uint8_t(std::isnan(allele) ? 0 : 1), offset, os_it); } template template - inline void writer::allele_encoder::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) + inline void detail::allele_encoder::encode(const T& allele, std::uint64_t offset, std::ostreambuf_iterator& os_it) { prefixed_varint::encode(std::uint8_t(std::round((std::isnan(allele) ? T(0.5) : allele) * multiplier) - T(1)), offset, os_it); } template<> template - inline std::int8_t writer::allele_encoder<0>::encode(const T& allele) + inline std::int8_t detail::allele_encoder<0>::encode(const T& allele) { return -1; } template<> template - inline std::int8_t writer::allele_encoder<1>::encode(const T& allele) + inline std::int8_t detail::allele_encoder<1>::encode(const T& allele) { return std::int8_t(std::isnan(allele) ? 0 : (allele == T() ? -1 : 1)); } template template - inline std::int8_t writer::allele_encoder::encode(const T& allele) + inline std::int8_t detail::allele_encoder::encode(const T& allele) { return std::int8_t(std::round((std::isnan(allele) ? T(0.5) : allele) * multiplier) - T(1)); } + + + template + std::size_t writer::get_string_size(T str) + { + return str.size(); + } + + template <> + inline std::size_t writer::get_string_size(const char* str) + { + return std::strlen(str); + } } } -#endif //LIBSAVVY_CMF_READER_HPP +#endif //LIBSAVVY_SAV_READER_HPP diff --git a/include/savvy/savvy.hpp b/include/savvy/savvy.hpp index 092bc02..88bd90e 100644 --- a/include/savvy/savvy.hpp +++ b/include/savvy/savvy.hpp @@ -1,20 +1,21 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_SAVVY_HPP #define LIBSAVVY_SAVVY_HPP #include #include #include +#include "utility.hpp" namespace savvy { namespace detail { - template - std::unique_ptr make_unique(Args&& ...args) - { - return std::unique_ptr(new T(std::forward(args)...)); - } - bool has_extension(const std::string& fullString, const std::string& ext); #if !defined(__GNUC__) || defined(__clang__) || __GNUC__ > 4 diff --git a/include/savvy/site_info.hpp b/include/savvy/site_info.hpp index 7b29200..c95952b 100644 --- a/include/savvy/site_info.hpp +++ b/include/savvy/site_info.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_SITE_INFO_HPP #define LIBSAVVY_SITE_INFO_HPP @@ -23,16 +28,16 @@ namespace savvy site_info( std::string&& chromosome, - std::uint64_t locus, + std::uint64_t pos, std::string&& ref, std::string&& alt, std::unordered_map&& properties) : + properties_(std::move(properties)), chromosome_(std::move(chromosome)), - locus_(locus), ref_(std::move(ref)), alt_(std::move(alt)), - properties_(std::move(properties)) + position_(pos) { } @@ -42,7 +47,8 @@ namespace savvy const std::string& chromosome() const { return chromosome_; } const std::string& ref() const { return ref_; } const std::string& alt() const { return alt_; } - std::uint64_t locus() const { return locus_; } + //[[deprecated]] std::uint64_t locus() const { return position_; } + std::uint64_t position() const { return position_; } const std::string& prop(const std::string& key) const { auto it = properties_.find(key); @@ -51,14 +57,24 @@ namespace savvy return it->second; } private: + std::unordered_map properties_; std::string chromosome_; std::string ref_; std::string alt_; - std::unordered_map properties_; - std::uint64_t locus_; + std::uint64_t position_; static const std::string empty_string; }; + template + class variant : public site_info + { + public: + T& data() { return data_; } + const T& data() const { return data_; } + private: + T data_; + }; + // template // class allele_vector : public variant_vector // { diff --git a/include/savvy/ublas_vector.hpp b/include/savvy/ublas_vector.hpp index 2010d7f..d681dbe 100644 --- a/include/savvy/ublas_vector.hpp +++ b/include/savvy/ublas_vector.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_UBLAS_VECTOR_HPP #define LIBSAVVY_UBLAS_VECTOR_HPP diff --git a/include/savvy/utility.hpp b/include/savvy/utility.hpp index ddfb127..7cc2f6c 100644 --- a/include/savvy/utility.hpp +++ b/include/savvy/utility.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_UTILITY_HPP #define LIBSAVVY_UTILITY_HPP @@ -7,18 +13,22 @@ namespace savvy { -// template -// std::unique_ptr make_unique(Args&&... args) -// { -// return std::unique_ptr(new T(std::forward(args)...)); -// } + namespace detail + { + template + std::unique_ptr make_unique(Args&&... args) + { + return std::unique_ptr(new T(std::forward(args)...)); + } + } - static const std::string version = SAVVY_VERSION; + std::string savvy_version(); std::string parse_header_id(std::string header_value); namespace detail { +#if __cpp_decltype_auto >= 201304 template decltype(auto) apply_impl(F&& fn, Tuple&& t, std::index_sequence) { @@ -34,6 +44,7 @@ namespace savvy std::forward(t), std::make_index_sequence()); } +#endif } } diff --git a/include/savvy/variant_iterator.hpp b/include/savvy/variant_iterator.hpp index dd5cd49..5ed5beb 100644 --- a/include/savvy/variant_iterator.hpp +++ b/include/savvy/variant_iterator.hpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #ifndef LIBSAVVY_VARIANT_ITERATOR_HPP #define LIBSAVVY_VARIANT_ITERATOR_HPP diff --git a/include/savvy/varint.hpp b/include/savvy/varint.hpp index 5e64198..20981da 100644 --- a/include/savvy/varint.hpp +++ b/include/savvy/varint.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_VARINT_HPP #define LIBSAVVY_VARINT_HPP diff --git a/include/savvy/vcf_reader.hpp b/include/savvy/vcf_reader.hpp index 03b827c..9cd29b9 100644 --- a/include/savvy/vcf_reader.hpp +++ b/include/savvy/vcf_reader.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + #ifndef LIBSAVVY_VCF_READER_HPP #define LIBSAVVY_VCF_READER_HPP @@ -26,6 +32,8 @@ #include #include #include +#include +#include namespace savvy { @@ -45,6 +53,7 @@ namespace savvy { public: reader_base() : + subset_size_(0), state_(std::ios::goodbit), gt_(nullptr), gt_sz_(0), @@ -52,6 +61,9 @@ namespace savvy {} reader_base(reader_base&& source) : + headers_(std::move(source.headers_)), + subset_map_(std::move(source.subset_map_)), + subset_size_(source.subset_size_), state_(source.state_), gt_(source.gt_), gt_sz_(source.gt_sz_), @@ -76,22 +88,18 @@ namespace savvy bool bad() const { return (state_ & std::ios::badbit) != 0; } bool eof() const { return (state_ & std::ios::eofbit) != 0; } - const char** samples_begin() const; - const char** samples_end() const; - - std::vector prop_fields() const; - std::vector> headers() const; -// std::vector::const_iterator prop_fields_begin() const { return property_fields_.begin(); } -// std::vector::const_iterator prop_fields_end() const { return property_fields_.end(); } - std::uint64_t sample_count() const; + std::vector subset_samples(const std::set& subset); - template - bool read_variant(site_info& annotations, T&... destinations); + const std::vector& samples() const; + const std::vector& info_fields() const; + const std::vector>& headers() const { return headers_; } protected: virtual bcf_hdr_t* hts_hdr() const = 0; virtual bcf1_t* hts_rec() const = 0; virtual bool read_hts_record() = 0; + void init_sample_ids(); void init_property_fields(); + void init_headers(); template void clear_destinations(T& destination); @@ -101,11 +109,11 @@ namespace savvy void read_variant_details(site_info& destination); template - void read_requested_genos(T&... vec); + std::size_t read_requested_genos(T&... vec); template - void read_genos_to(fmt data_format, T1& vec); + bool read_genos_to(fmt data_format, T1& vec); template - void read_genos_to(fmt data_format, T1& vec, T2&... others); + bool read_genos_to(fmt data_format, T1& vec, T2&... others); template void read_genotypes_al(T& destination); @@ -116,6 +124,8 @@ namespace savvy template void read_genotypes_ds(T& destination); template + void read_genotypes_hds(T& destination); + template void read_genotypes_gl(T& destination); template void read_genotypes_pl(T& destination); @@ -124,12 +134,16 @@ namespace savvy template void init_requested_formats(fmt f, T2... args); protected: + std::vector> headers_; + std::vector sample_ids_; + std::vector property_fields_; + std::array requested_data_formats_; + std::vector subset_map_; + std::uint64_t subset_size_; std::ios::iostate state_; int* gt_; int gt_sz_; int allele_index_; - std::vector property_fields_; // TODO: This member is no longer necessary not that prop_fields_{begin,end}() methods are gone. Eventually remove this and init_property_fileds() - std::array requested_data_formats_; }; //################################################################// @@ -146,15 +160,11 @@ namespace savvy reader& operator=(const reader&) = delete; ~reader(); - template - reader& operator>>(std::tuple& destination) + template + reader& operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + //static_assert(VecCnt == 1, "Extraction operator only supported with single format field reader"); + return this->read(destination, destination.data()); } template @@ -223,9 +233,8 @@ namespace savvy std::vector chromosomes() const; - template - indexed_reader& operator>>(std::tuple& destination); - + template + indexed_reader& operator>>(variant& destination); template indexed_reader& read(site_info& annotations, T&... destinations); @@ -247,39 +256,70 @@ namespace savvy }; //################################################################// + //################################################################// + template + class writer + { + public: + struct options + { + compression_type compression; + options() : + compression(compression_type::none) + {} + }; + + template + writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, Fmt... data_formats); + + template + writer(const std::string& file_path, const options& opts, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, Fmt... data_formats); + + void init_format_fields(savvy::fmt f); + + template + void init_format_fields(savvy::fmt f, Fmt... other); + + template + writer& write(const site_info& anno, const T&... data); + template + writer& operator<<(const variant>& v); + bool good() { return output_stream_->good(); } + private: + template + static const std::vector& get_vec(std::size_t offset, const std::vector& m); + template + static const std::vector& get_vec(std::size_t offset, const std::vector& m, const T2&... other); + template + void write_multi_sample_level_data(const std::size_t ploidy, const T&... data); + template + void write_single_sample_level_data(const std::size_t ploidy, const T& data); + private: + std::vector info_fields_; + std::vector format_fields_; + std::unique_ptr output_stream_; + std::size_t sample_size_; + }; //################################################################// - template - const char** reader_base::samples_begin() const - { - return hts_hdr() ? (const char**) hts_hdr()->samples : nullptr; - } - template - const char** reader_base::samples_end() const - { - return hts_hdr() ? (const char**) ((hts_hdr()->samples) + bcf_hdr_nsamples(hts_hdr())) : nullptr; - } - template - std::uint64_t reader_base::sample_count() const - { - return static_cast(bcf_hdr_nsamples(hts_hdr())); - } + + + + //################################################################// template - std::vector> reader_base::headers() const + void reader_base::init_headers() { - std::vector> ret; - bcf_hdr_t* hdr = hts_hdr(); if (hdr) { - ret.reserve(hdr->nhrec - 1); + this->headers_.reserve(std::size_t(hdr->nhrec - 1)); for (int i = 1; i < hdr->nhrec; ++i) { std::string key, val; @@ -310,19 +350,65 @@ namespace savvy } if (key.size()) - ret.emplace_back(std::move(key), std::move(val)); + this->headers_.emplace_back(std::move(key), std::move(val)); //ret.insert(std::upper_bound(ret.begin(), ret.end(), std::make_pair(key, std::string()), [](const auto& a, const auto& b) { return a.first < b.first; }), {std::move(key), std::move(val)}); } } + } + + template + std::vector reader_base::subset_samples(const std::set& subset) + { + std::vector ret; + const char** beg = hts_hdr() ? (const char**) hts_hdr()->samples : nullptr; + const char** end = hts_hdr() ? (const char**) (hts_hdr()->samples + bcf_hdr_nsamples(hts_hdr())) : nullptr; + ret.reserve(std::min(subset.size(), (std::size_t)std::distance(beg, end))); + + subset_map_.clear(); + subset_map_.resize((std::size_t)std::distance(beg, end), std::numeric_limits::max()); + std::uint64_t subset_index = 0; + for (auto it = beg; it != end; ++it) + { + if (subset.find(*it) != subset.end()) + { + subset_map_[std::distance(beg, it)] = subset_index; + ret.push_back(*it); + ++subset_index; + } + } + + subset_size_ = subset_index; return ret; } template - std::vector reader_base::prop_fields() const + const std::vector& reader_base::samples() const { - std::vector ret(property_fields_); - return ret; + return sample_ids_; + } + + template + const std::vector& reader_base::info_fields() const + { + return property_fields_; + } + + template + void reader_base::init_sample_ids() + { + bcf_hdr_t* hdr = hts_hdr(); + if (hdr) + { + const char **beg = (const char **) (hdr->samples); + const char **end = (const char **) (hdr->samples + bcf_hdr_nsamples(hdr)); + sample_ids_.reserve(end - beg); + + for (; beg != end; ++beg) + { + sample_ids_.emplace_back(*beg); + } + } } template @@ -331,6 +417,7 @@ namespace savvy bcf_hdr_t* hdr = hts_hdr(); if (hdr) { + this->property_fields_ = {"ID", "QUAL", "FILTER"}; for (int i = 0; i < hdr->nhrec; ++i) { @@ -382,24 +469,13 @@ namespace savvy template template - bool reader_base::read_variant(site_info& annotations, T&... destinations) - { - static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); - read_variant_details(annotations); - read_requested_genos(destinations...); - - return good(); - } - - template - template - void reader_base::read_requested_genos(T&... destinations) + std::size_t reader_base::read_requested_genos(T&... destinations) { + std::size_t cnt = 0; clear_destinations(destinations...); bcf_unpack(hts_rec(), BCF_UN_ALL); for (int i = 0; i < hts_rec()->n_fmt; ++i) { - auto rec_ptr = hts_rec(); int fmt_id = hts_rec()->d.fmt[i].id; std::string fmt_key = hts_hdr()->id[BCF_DT_ID][fmt_id].key; @@ -410,40 +486,46 @@ namespace savvy { if (gt_idx < VecCnt) { - read_genos_to<0>(fmt::genotype, destinations...); + cnt += read_genos_to<0>(fmt::genotype, destinations...); } else // allele_idx < VecCnt { - read_genos_to<0>(fmt::allele, destinations...); + cnt += read_genos_to<0>(fmt::allele, destinations...); } } else if (fmt_key == "DS") { - read_genos_to<0>(fmt::dosage, destinations...); + cnt += read_genos_to<0>(fmt::dosage, destinations...); + } + else if (fmt_key == "HDS") + { + cnt += read_genos_to<0>(fmt::haplotype_dosage, destinations...); } else if (fmt_key == "GP") { - read_genos_to<0>(fmt::genotype_probability, destinations...); + cnt += read_genos_to<0>(fmt::genotype_probability, destinations...); } else if (fmt_key == "GL") { - read_genos_to<0>(fmt::genotype_likelihood, destinations...); + cnt += read_genos_to<0>(fmt::genotype_likelihood, destinations...); } else if (fmt_key == "PL") { - read_genos_to<0>(fmt::phred_scaled_genotype_likelihood, destinations...); + cnt += read_genos_to<0>(fmt::phred_scaled_genotype_likelihood, destinations...); } else { // Discard } } + return cnt; } template template - void reader_base::read_genos_to(fmt data_format, T1& destination) + bool reader_base::read_genos_to(fmt data_format, T1& destination) { + bool ret = true; if (requested_data_formats_[Idx] == data_format) { switch (data_format) @@ -460,6 +542,9 @@ namespace savvy case fmt::dosage: read_genotypes_ds(destination); break; + case fmt::haplotype_dosage: + read_genotypes_hds(destination); + break; case fmt::genotype_likelihood: read_genotypes_gl(destination); break; @@ -471,20 +556,22 @@ namespace savvy else { // Discard Genotypes + ret = false; } + return ret; } template template - void reader_base::read_genos_to(fmt data_format, T1& vec, T2&... others) + bool reader_base::read_genos_to(fmt data_format, T1& vec, T2&... others) { if (requested_data_formats_[Idx] == data_format) { - read_genos_to(data_format, vec); + return read_genos_to(data_format, vec); } else { - read_genos_to(data_format, others...); + return read_genos_to(data_format, others...); } } @@ -512,9 +599,16 @@ namespace savvy std::unordered_map props; props.reserve(n_info + 2); - std::string qual(std::to_string(hts_rec()->qual)); - qual.erase(qual.find_last_not_of(".0") + 1); // rtrim zeros. - props["QUAL"] = std::move(qual); + if (std::isnan(hts_rec()->qual)) + { + props["QUAL"] = "."; + } + else + { + std::string qual(std::to_string(hts_rec()->qual)); + qual.erase(qual.find_last_not_of(".0") + 1); // rtrim zeros. + props["QUAL"] = std::move(qual); + } std::stringstream ss; for (std::size_t i = 0; i < n_flt; ++i) @@ -580,21 +674,41 @@ namespace savvy const typename T::value_type alt_value = typename T::value_type(1); if (allele_index_ > 1 || bcf_get_genotypes(hts_hdr(), hts_rec(), &(gt_), &(gt_sz_)) >= 0) { - if (gt_sz_ % sample_count() != 0) + if (gt_sz_ % samples().size() != 0) { // TODO: mixed ploidy at site error. } else { - destination.resize(gt_sz_); - const int allele_index_plus_one = allele_index_ + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + const std::uint64_t ploidy(gt_sz_ / samples().size()); + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (gt_[i] == bcf_gt_missing) + destination[subset_map_[sample_index] * ploidy + (i % ploidy)] = std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[subset_map_[sample_index] * ploidy + (i % ploidy)] = alt_value; + } + } + } + else { - if (gt_[i] == bcf_gt_missing) - destination[i] = std::numeric_limits::quiet_NaN(); - else if ((gt_[i] >> 1) == allele_index_plus_one) - destination[i] = alt_value; + destination.resize(gt_sz_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (gt_[i] == bcf_gt_missing) + destination[i] = std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[i] = alt_value; + } } return; } @@ -613,22 +727,42 @@ namespace savvy const typename T::value_type alt_value = typename T::value_type(1); if (allele_index_ > 1 || bcf_get_genotypes(hts_hdr(), hts_rec(), &(gt_), &(gt_sz_)) >= 0) { - if (gt_sz_ % sample_count() != 0) + if (gt_sz_ % samples().size() != 0) { // TODO: mixed ploidy at site error. } else { - const std::uint64_t ploidy(gt_sz_ / sample_count()); - destination.resize(sample_count()); - + const std::uint64_t ploidy(gt_sz_ / samples().size()); const int allele_index_plus_one = allele_index_ + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + + if (subset_map_.size()) + { + destination.resize(subset_size_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (gt_[i] == bcf_gt_missing) + destination[subset_map_[sample_index]] += std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[subset_map_[sample_index]] += alt_value; + } + } + } + else { - if (gt_[i] == bcf_gt_missing) - destination[i / ploidy] += std::numeric_limits::quiet_NaN(); - else if ((gt_[i] >> 1) == allele_index_plus_one) - destination[i / ploidy] += alt_value; + destination.resize(samples().size()); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (gt_[i] == bcf_gt_missing) + destination[i / ploidy] += std::numeric_limits::quiet_NaN(); + else if ((gt_[i] >> 1) == allele_index_plus_one) + destination[i / ploidy] += alt_value; + } } return; } @@ -660,12 +794,91 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample); - destination.resize(gt_sz_); + const typename T::value_type zero_value{0}; - float* ds = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) + { + destination.resize(subset_size_); + + float* ds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (ds[i] != zero_value) + destination[subset_map_[sample_index]] = ds[i]; + } + } + } + else { - destination[i] = ds[i]; + destination.resize(gt_sz_); + + float* ds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (ds[i] != zero_value) + destination[i] = ds[i]; + } + } + return; + } + } + + this->state_ = std::ios::failbit; + } + } + + template + template + void reader_base::read_genotypes_hds(T& destination) + { + if (good()) + { + if (hts_rec()->n_allele > 2) + { + state_ = std::ios::failbit; // multi allelic HDS not supported. + return; + } + + if (bcf_get_format_float(hts_hdr(),hts_rec(),"HDS", &(gt_), &(gt_sz_)) >= 0) + { + int num_samples = hts_hdr()->n[BCF_DT_SAMPLE]; + if (gt_sz_ % num_samples != 0) + { + // TODO: mixed ploidy at site error. + } + else + { + const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample); + const typename T::value_type zero_value{0}; + + if (subset_map_.size()) + { + destination.resize(subset_size_ * ploidy); + + float* hds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy; + if (subset_map_[sample_index] != std::numeric_limits::max()) + { + if (hds[i] != zero_value) + destination[subset_map_[sample_index] * ploidy + (i % ploidy)] = hds[i]; + } + } + } + else + { + destination.resize(gt_sz_); + + float* hds = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + if (hds[i] != zero_value) + destination[i] = hds[i]; + } } return; } @@ -697,12 +910,29 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - float* gp = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = gp[i]; + destination.resize(subset_size_ * ploidy_plus_one); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy_plus_one + (i % ploidy_plus_one)] = gp[i]; + } + } + else + { + destination.resize(gt_sz_); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gp[i]; + } } return; } @@ -734,12 +964,29 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - float* gp = (float*)(void*)(gt_); - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = gp[i]; + destination.resize(subset_size_ * ploidy_plus_one); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy + (i % ploidy_plus_one)] = gp[i]; + } + } + else + { + destination.resize(gt_sz_); + + float* gp = (float*) (void*) (gt_); + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gp[i]; + } } return; } @@ -771,11 +1018,28 @@ namespace savvy else { const std::uint64_t ploidy(gt_sz_ / hts_rec()->n_sample - 1); - destination.resize(gt_sz_); + const std::uint64_t ploidy_plus_one = ploidy + 1; - for (std::size_t i = 0; i < gt_sz_; ++i) + if (subset_map_.size()) { - destination[i] = gt_[i]; + destination.resize(subset_size_ * ploidy_plus_one); + + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + const std::uint64_t sample_index = i / ploidy_plus_one; + if (subset_map_[sample_index] != std::numeric_limits::max()) + destination[subset_map_[sample_index] * ploidy + (i % ploidy_plus_one)] = gt_[i]; + } + } + else + { + destination.resize(gt_sz_); + + for (std::size_t i = 0; i < gt_sz_; ++i) + { + destination[i] = gt_[i]; + } } return; } @@ -806,7 +1070,11 @@ namespace savvy if (!hts_hdr_) this->state_ = std::ios::badbit; else + { this->init_property_fields(); + this->init_headers(); + this->init_sample_ids(); + } } } @@ -867,35 +1135,41 @@ namespace savvy template reader& reader::read(site_info& annotations, T&... destinations) { - this->read_variant(annotations, destinations...); + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); + std::size_t vecs_read = 0; + while (vecs_read == 0) + { + this->read_variant_details(annotations); + vecs_read = this->read_requested_genos(destinations...); + } + return *this; } //################################################################// + + //################################################################// template - template - indexed_reader& indexed_reader::operator>>(std::tuple& destination) + template + indexed_reader& indexed_reader::operator>>(variant& destination) { - ::savvy::detail::apply([this](site_info& anno, auto&... args) - { - this->read(anno, std::forward(args)...); - }, - destination); - return *this; + //static_assert(VecCnt == 1, "Extraction operator only supported with one format field"); + return this->read(destination, destination.data()); } template template indexed_reader& indexed_reader::read(site_info& annotations, T&... destinations) { + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); while (this->good()) { - this->read_variant(annotations, destinations...); + this->read_variant_details(annotations); if (this->good() && region_compare(bounding_type_, annotations, region_)) { - this->read_requested_genos(destinations...); - break; + if (this->read_requested_genos(destinations...) > 0) + break; } } return *this; @@ -905,6 +1179,7 @@ namespace savvy template indexed_reader& indexed_reader::read_if(Pred fn, site_info& annotations, T&... destinations) { + static_assert(VecCnt == sizeof...(T), "The number of destination vectors must match class template size"); while (this->good()) { this->read_variant_details(annotations); @@ -912,8 +1187,8 @@ namespace savvy { if (fn(annotations) && region_compare(bounding_type_, annotations, region_)) { - this->read_requested_genos(destinations...); - break; + if (this->read_requested_genos(destinations...) > 0) + break; } } } @@ -945,9 +1220,13 @@ namespace savvy if (reg.from() > 1 || reg.to() != std::numeric_limits::max()) contigs << ":" << reg.from() << "-" << reg.to(); - bcf_sr_set_regions(synced_readers_, contigs.str().c_str(), 0); - if (bcf_sr_add_reader(synced_readers_, file_path_.c_str())) + + if (bcf_sr_set_regions(synced_readers_, contigs.str().c_str(), 0) == 0 && bcf_sr_add_reader(synced_readers_, file_path_.c_str()) == 1) + { this->init_property_fields(); + this->init_headers(); + this->init_sample_ids(); + } else this->state_ = std::ios::badbit; } @@ -1016,32 +1295,31 @@ namespace savvy - - - //################################################################// - class writer + template + template + writer::writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, Fmt... data_formats) : + writer(file_path, options(), samples_beg, samples_end, headers_beg, headers_end, data_formats...) { - public: - struct options - { - compression_type compression; - options() : - compression(compression_type::none) - {} - }; + } - template - writer(const std::string& file_path, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, const options& opts = options()) : - output_stream_(opts.compression == compression_type::none ? std::unique_ptr(new std::ofstream(file_path)) : std::unique_ptr(new shrinkwrap::bgz::ostream(file_path))), - sample_size_(0) - { - (*output_stream_) << "##fileformat=VCFv4.2" << std::endl; + template + template + writer::writer(const std::string& file_path, const options& opts, RandAccessStringIterator samples_beg, RandAccessStringIterator samples_end, RandAccessKVPIterator headers_beg, RandAccessKVPIterator headers_end, Fmt... data_formats) : + output_stream_(opts.compression == compression_type::none ? std::unique_ptr(new std::ofstream(file_path)) : std::unique_ptr(new shrinkwrap::bgz::ostream(file_path))), + sample_size_(0) + { + static_assert(VecCnt == sizeof...(Fmt), "Number of requested format fields do not match VecCnt template parameter"); - std::ostreambuf_iterator out_it(*output_stream_); + this->format_fields_.reserve(sizeof...(Fmt)); + this->init_format_fields(data_formats...); + + (*output_stream_) << "##fileformat=VCFv4.2" << std::endl; - for (auto it = headers_beg; it != headers_end; ++it) + for (auto it = headers_beg; it != headers_end; ++it) + { + if (it->first != "FORMAT") { (*output_stream_) << (std::string("##") + it->first + "=" + it->second) << std::endl; @@ -1085,99 +1363,323 @@ namespace savvy } } } + } - (*output_stream_) << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; - for (auto it = samples_beg; it != samples_end; ++it) - { - (*output_stream_) << (std::string("\t") + *it); - ++sample_size_; - } - (*output_stream_) << std::endl; + for (auto f : this->format_fields_) + { + if (f == savvy::fmt::allele) + (*output_stream_) << "##FORMAT=" << std::endl; + else if (f == savvy::fmt::haplotype_dosage) + (*output_stream_) << "##FORMAT=" << std::endl; + else if (f == savvy::fmt::dosage) + (*output_stream_) << "##FORMAT=" << std::endl; + //else if (f == savvy::fmt::genotype_probability) + // (*output_stream_) << "##FORMAT"="" << std::endl; // TODO: Handle other ploidy levels. } - template - writer& write(const site_info& anno, const std::vector& data) + (*output_stream_) << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"; + for (auto it = samples_beg; it != samples_end; ++it) { + (*output_stream_) << (std::string("\t") + *it); + ++sample_size_; + } + + (*output_stream_) << std::endl; + } + + template + void writer::init_format_fields(savvy::fmt f) + { + this->format_fields_.push_back(f); + } + + template + template + void writer::init_format_fields(savvy::fmt f, Fmt... other) + { + this->format_fields_.push_back(f); + this->init_format_fields(other...); + } + + template + template + writer& writer::operator<<(const variant>& v) + { + return this->write(v, v.data()); + } + + template + template + writer& writer::write(const site_info& anno, const T&... data) + { + static_assert(VecCnt == sizeof...(T), "The number of source vectors must match class template size"); + + if (good()) + { + // VALIDATE VECTOR SIZES + std::size_t ploidy = 0; + for (std::size_t i = 0; i < format_fields_.size(); ++i) + { + fmt f = format_fields_[i]; + if (f == fmt::allele || f == fmt::haplotype_dosage) + { + if (ploidy) + { + if ((get_vec(i, data...).size() / sample_size_) != ploidy) + this->output_stream_->setstate(std::ios::failbit); + } + else + { + ploidy = (get_vec(i, data...).size() / sample_size_); + } + } + else if (f == fmt::dosage) + { + if (sample_size_ != get_vec(i, data...).size()) + this->output_stream_->setstate(std::ios::failbit); + } + } + if (good()) { - if (data.size() % sample_size_ != 0) + (*output_stream_) << anno.chromosome() + << "\t" << anno.position() + << "\t" << std::string(anno.prop("ID").size() ? anno.prop("ID") : ".") + << "\t" << anno.ref() + << "\t" << anno.alt() + << "\t" << std::string(anno.prop("QUAL").size() ? anno.prop("QUAL") : ".") + << "\t" << std::string(anno.prop("FILTER").size() ? anno.prop("FILTER") : "."); + + std::size_t i = 0; + for (auto it = info_fields_.begin(); it != info_fields_.end(); ++it) { - output_stream_->setstate(std::ios::failbit); + if (anno.prop(*it).size()) + { + if (i == 0) + (*output_stream_) << "\t"; + else + (*output_stream_) << ";"; + + (*output_stream_) << (*it + "=" + anno.prop(*it)); + + ++i; + } + } + + if (i == 0) + (*output_stream_) << "\t."; + + for (auto it = format_fields_.begin(); it != format_fields_.end(); ++it) + { + if (std::distance(format_fields_.begin(), it) > 0) + output_stream_->put(':'); + (*output_stream_) << (*it == fmt::dosage ? "\tDS" : (*it == fmt::haplotype_dosage ? "\tHDS" : "\tGT")); + } + + if (VecCnt == 1) + { + this->write_single_sample_level_data(ploidy, data...); } else { - (*output_stream_) << anno.chromosome() - << "\t" << anno.locus() - << "\t" << std::string(anno.prop("ID").size() ? anno.prop("ID") : ".") - << "\t" << anno.ref() - << "\t" << anno.alt() - << "\t" << std::string(anno.prop("QUAL").size() ? anno.prop("QUAL") : ".") - << "\t" << std::string(anno.prop("FILTER").size() ? anno.prop("FILTER") : "."); - - std::size_t i = 0; - for (auto it = info_fields_.begin(); it != info_fields_.end(); ++it) + this->write_multi_sample_level_data(ploidy, data...); + } + } + } + + return *this; + } + + template + template + void writer::write_multi_sample_level_data(const std::size_t ploidy, const T&... data) + { + if (this->good()) + { + std::ostreambuf_iterator out_it(*output_stream_); + for (std::size_t sample_index = 0; sample_index < sample_size_; ++sample_index) + { + for (std::size_t format_index = 0; format_index < format_fields_.size(); ++format_index) + { + auto v = get_vec(format_index, data...); + fmt f = format_fields_[format_index]; + if (f == fmt::allele) { - if (anno.prop(*it).size()) + out_it = '\t'; + + std::size_t i = sample_index * ploidy; + if (std::isnan(v[i])) + out_it = '.'; + else if (v[i] == 0.0) + out_it = '0'; + else + out_it = '1'; + + std::size_t end = ploidy + i; + ++i; + for ( ; i < end; ++i) { - if (i == 0) - (*output_stream_) << "\t"; + out_it = '|'; + + if (std::isnan(v[i])) + out_it = '.'; + else if (v[i] == 0.0) + out_it = '0'; else - (*output_stream_) << ";"; + out_it = '1'; + } + } + else if (f == fmt::haplotype_dosage) + { + out_it = '\t'; - (*output_stream_) << (*it + "=" + anno.prop(*it)); + std::size_t i = sample_index * ploidy; + if (std::isnan(v[i])) + out_it = '.'; + else + { + for (const auto c : std::to_string(v[i])) + out_it = c; + } + + std::size_t end = ploidy + i; + ++i; + for ( ; i < end; ++i) + { + out_it = ','; - ++i; + if (std::isnan(v[i])) + out_it = '.'; + else + { + for (const auto c : std::to_string(v[i])) + out_it = c; + } } } + else //if (f == fmt::dosage) + { + out_it = '\t'; - if (i == 0) - (*output_stream_) << "\t."; - - (*output_stream_) << "\tGT"; - write_genotypes(data); - (*output_stream_) << std::endl; + if (std::isnan(v[sample_index])) + out_it = '.'; + else + { + for (const auto c : std::to_string(v[sample_index])) + out_it = c; + } + } } } - return *this; - } - - template - writer& operator<<(const std::tuple>& m) - { - return this->write(std::get<0>(m), std::get<1>(m)); + (*output_stream_) << std::endl; } + } - bool good() { return output_stream_->good(); } - private: - template - void write_genotypes(const std::vector& m) + template + template + void writer::write_single_sample_level_data(const std::size_t ploidy, const T& data) + { + if (good()) { - std::size_t i = 0; - const std::size_t ploidy = m.size() / sample_size_; std::ostreambuf_iterator out_it(*output_stream_); - for (auto it = m.begin(); it != m.end(); ++it) + if (format_fields_[0] == fmt::allele) { - if (i % ploidy == 0) + for (std::size_t sample_index = 0; sample_index < sample_size_; ++sample_index) + { out_it = '\t'; - else - out_it = '|'; - if (std::isnan(*it)) - out_it = '.'; - else if (*it == 0.0) - out_it = '0'; - else - out_it = '1'; + std::size_t i = sample_index * ploidy; + if (std::isnan(data[i])) + out_it = '.'; + else if (data[i] == 0.0) + out_it = '0'; + else + out_it = '1'; + + std::size_t end = ploidy + i; + ++i; + for (; i < end; ++i) + { + out_it = '|'; + + if (std::isnan(data[i])) + out_it = '.'; + else if (data[i] == 0.0) + out_it = '0'; + else + out_it = '1'; + } + } + } + else if (format_fields_[0] == fmt::haplotype_dosage) + { + for (std::size_t sample_index = 0; sample_index < sample_size_; ++sample_index) + { + out_it = '\t'; + + std::size_t i = sample_index * ploidy; + if (std::isnan(data[i])) + out_it = '.'; + else + { + for (const auto c : std::to_string(data[i])) + out_it = c; + } + + std::size_t end = ploidy + i; + ++i; + for (; i < end; ++i) + { + out_it = ','; - ++i; + if (std::isnan(data[i])) + out_it = '.'; + else + { + for (const auto c : std::to_string(data[i])) + out_it = c; + } + } + } } + else //if (f == fmt::dosage) + { + for (std::size_t sample_index = 0; sample_index < sample_size_; ++sample_index) + { + out_it = '\t'; + + if (std::isnan(data[sample_index])) + out_it = '.'; + else + { + for (const auto c : std::to_string(data[sample_index])) + out_it = c; + } + } + } + + (*output_stream_) << std::endl; } - private: - std::vector info_fields_; - std::unique_ptr output_stream_; - std::size_t sample_size_; - }; + } + + template + template + const std::vector& writer::get_vec(std::size_t offset, const std::vector& m) + { + assert(offset == 0); + return m; + } + + template + template + const std::vector& writer::get_vec(std::size_t offset, const std::vector& m, const T2&... other) + { + if ((sizeof...(T2) + 1) - offset == 0) + return m; + else + return get_vec(offset - 1, other...); + } //################################################################// } } diff --git a/include/savvy/test_class.hpp b/include/test/test_class.hpp similarity index 90% rename from include/savvy/test_class.hpp rename to include/test/test_class.hpp index d31ddc2..9221eae 100644 --- a/include/savvy/test_class.hpp +++ b/include/test/test_class.hpp @@ -1,3 +1,9 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + //#ifndef LIBSAVVY_TEST_CLASS_HPP //#define LIBSAVVY_TEST_CLASS_HPP // diff --git a/requirements.txt b/requirements.txt index e707d7f..049f7f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -jonathonl/shrinkwrap@v1.0.0-alpha3 -htslib,https://github.com/samtools/htslib/releases/download/1.3.1/htslib-1.3.1.tar.bz2 --hash md5:16d78f90b72f29971b042e8da8be6843 --cmake dep/htslib.cmake \ No newline at end of file +jonathonl/shrinkwrap@develop +htslib,https://github.com/samtools/htslib/releases/download/1.6/htslib-1.6.tar.bz2 --hash md5:d6fd14e208aca7e08cbe9072233d0af9 --cmake dep/htslib.cmake \ No newline at end of file diff --git a/s1r_spec.md b/s1r_spec.md index 0b30dff..80d7fb3 100644 --- a/s1r_spec.md +++ b/s1r_spec.md @@ -1,57 +1,56 @@ # Sort-tile-recursive One-dimensional R-tree Specification -## Header Format +## Overall Layout +The footer of the index file (the last 26 bytes) should be read first. The footer contains the size in bytes of the chromosome list, which should be read second. An R-tree exists for each chromosome. Trees start from the beginning of the index file in the same order as they appear in the chromosome list. The position of each node of each tree can be inferred from the data contained in the footer and chromosome list. ``` -+----------------------------------------------------------------+ -| "s1r" string in binary + 4 bytes for version (Major.minor) | -+----------------------------------------------------------------+ -| 01110011 00110001 01110010 MMMMMMMM MMMMMMMM mmmmmmmm mmmmmmmm | -+----------------------------------------------------------------+ - -+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| 16 byte UUID used to match index to indexed file | -+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU | -+-------------------------------------------------------------------------------------------------------------------------------------------------+ -``` -``` -+----------+----------+ -| RRRRRRSS | BBBBBBBB | -+----------+----------+ -R: Reserved. -S: Point in region by which to sort entries. (00 for midpoint, 10 for left and 01 for right). -B: Block size (B + 1 Kibibytes). -``` - -Each chromosome is described with the following. -``` -+----------+vvvvvvvvvvv+-------------------------------------------------------------------------+ -| SSSSSSSS | CHROM_STR | RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR | -+----------+vvvvvvvvvvv+-------------------------------------------------------------------------+ -S: Size of chromosome byte array encoded in one byte (1-255 range). A size of 0 terminates the chromosome list. -CHROM_STR: Chromosome byte array of size S bytes. -R: Number of records. -B: Max base pair position. -F: Max file position. ++vvvvvvvvvvvvvvvvvvvvv+--------------------------+ +| Upside Down R-trees | Chromosome List | Footer | ++vvvvvvvvvvvvvvvvvvvvv+--------------------------+ ``` +## R-tree Format +The Upside down R-tree starts with leaf nodes and ends with the root node. Each level of the tree is stored from left to right. All nodes must be full except for the last node at each level. + +The number of nodes at each level and the positions of child nodes are calculated using using the record count and block size. -Current block is then padded with zeros up to the next multiple of 65,536. Following the header is a tree for each chromosome. Trees are in the same order as listed in the header. +Take for example a tree that contains 1,000,000 records with a node size of 4096 bytes. The leaf nodes fit a maximum of 256 entries, while the internal nodes fit 512. 1,000,000 is divided by 256 to get 3,907 leaf nodes. 3,907 is divided by 512 to get 8 internal nodes at the next level down. Since 8 is less than 512, the following level contains the root node. -## Internal Nodes -Internal nodes contain at most (block size / 8) entries. Entries are made up of two big-endian 32-bit integers. The first of which represents the start value of the interval while the second represents the length of the interval. +### Generating the Tree +Trees are created by first sorting all entries by the midpoint of their intervals. The tree is then built from the top down with the intervals of each entry in the internal nodes representing the smallest range that contains the child node's intervals. -## Leaf Nodes +### Leaf Nodes Internal nodes contain at most (block size / 16) entries. Entries are made up of two big-endian 32-bit integers and one 64-bit integer. The first of the 32-bit integers represents the start value of the interval. The second represents the length of the interval. The 64-bit integer represents the offset of the record in the target file. In most cases, the unit for the record offset and length is a byte. -## Tree Structure -The index file begins with the header block followed by the root block. Then each level of the tree going downward is stored from left to right. All nodes must be full except for the last node at each level. - -The number of nodes at each level and the positions of child nodes are calculated using using the record count and block size. +### Internal Nodes +Internal nodes contain at most (block size / 8) entries. Entries are made up of two big-endian 32-bit integers. The first of which represents the start value of the interval while the second represents the length of the interval. -Take for example a tree that contains 1,000,000 records with a node size of 4096 bytes. The leaf nodes fit a maximum of 256 entries, while the internal nodes fit 512. 1,000,000 is divided by 256 to get 3,907 leaf nodes. 3,907 is divided by 512 to get 8 internal nodes at the next level up. Since 8 is less than 512, the following level contains the root node. +## Chromosome List Format +Each chromosome is described with the following. +``` ++vvvvvvvvvvv+-------------------------------------------------------------------------+ +| CHROM_STR | RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR RRRRRRRR | ++vvvvvvvvvvv+-------------------------------------------------------------------------+ +CHROM_STR: Chromosome byte array terminated by null byte. +R: Number of records stored in 8 bytes (big endian). +``` +## Footer Format +``` ++----------+-------------------+ +| BBBBBBBB | CCCCCCCC CCCCCCCC | ++----------+-------------------+ +B: Block size (B + 1 Kibibytes) +C: Chromosome list size in bytes -## Generating the Tree -Trees are created by first sorting all entries by the midpoint of their intervals. The tree is then built from the bottom up with the intervals of each entry in the internal nodes representing the smallest range that contains the child node's intervals. ++-------------------------------------------------------------------------------------------------------------------------------------------------+ +| 16 byte UUID used to match index to indexed file | ++-------------------------------------------------------------------------------------------------------------------------------------------------+ +| UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU UUUUUUUU | ++-------------------------------------------------------------------------------------------------------------------------------------------------+ ++----------------------------------------------------------------+ +| "s1r" string in binary + 4 bytes for version (Major.minor) | ++----------------------------------------------------------------+ +| 01110011 00110001 01110010 MMMMMMMM MMMMMMMM mmmmmmmm mmmmmmmm | ++----------------------------------------------------------------+ +``` ## diff --git a/src/reader.cpp b/src/reader.cpp deleted file mode 100644 index eb30ecb..0000000 --- a/src/reader.cpp +++ /dev/null @@ -1,7 +0,0 @@ - -#include "savvy/reader.hpp" - -namespace savvy -{ - -} \ No newline at end of file diff --git a/src/bcf2m3vcf.cpp b/src/sav/bcf2m3vcf.cpp similarity index 69% rename from src/bcf2m3vcf.cpp rename to src/sav/bcf2m3vcf.cpp index 230cac2..3003733 100644 --- a/src/bcf2m3vcf.cpp +++ b/src/sav/bcf2m3vcf.cpp @@ -1,3 +1,8 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ #include #include "savvy/vcf_reader.hpp" @@ -22,11 +27,11 @@ int main(int argc, char** argv) savvy::m3vcf::block output_block(sample_ids.size(), 2); while (input.good()) { - if (!output_block.add_marker(variant.locus(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end())) + if (!output_block.add_marker(variant.position(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end())) { compact_output << output_block; output_block = savvy::m3vcf::block(sample_ids.size(), 2); - output_block.add_marker(variant.locus(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end()); + output_block.add_marker(variant.position(), variant.ref(), variant.alt(), genotypes.begin(), genotypes.end()); } input.read(variant, genotypes); diff --git a/src/sav/export.cpp b/src/sav/export.cpp new file mode 100644 index 0000000..4a26abc --- /dev/null +++ b/src/sav/export.cpp @@ -0,0 +1,326 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include "sav/export.hpp" +#include "sav/sort.hpp" +#include "sav/utility.hpp" +#include "savvy/vcf_reader.hpp" +#include "savvy/sav_reader.hpp" +#include "savvy/savvy.hpp" + +#include +#include +#include +#include + +class export_prog_args +{ +private: + static const int default_compression_level = 3; + static const int default_block_size = 2048; + + std::vector