diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 73db8fb..f12bb5b 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -4,6 +4,8 @@ name: Main Extension Distribution Pipeline on: push: + branches: + - main pull_request: workflow_dispatch: diff --git a/.github/workflows/_extension_deploy.yml b/.github/workflows/_extension_deploy.yml index 87612dc..f002195 100644 --- a/.github/workflows/_extension_deploy.yml +++ b/.github/workflows/_extension_deploy.yml @@ -22,7 +22,7 @@ on: exclude_archs: required: false type: string - default: "windows_amd64_rtools" + default: "windows_amd64_rtools;wasm_mvp;wasm_eh;wasm_threads" # default: "linux_amd64;linux_amd64_gcc4;osx_arm64;wasm_mvp;wasm_eh;wasm_threads;linux_arm64;osx_amd64;windows_amd64_rtools" # Whether to upload this deployment as the latest. This may overwrite a previous deployment. deploy_latest: diff --git a/.github/workflows/_extension_distribution.yml b/.github/workflows/_extension_distribution.yml index eeaefcb..fe63222 100644 --- a/.github/workflows/_extension_distribution.yml +++ b/.github/workflows/_extension_distribution.yml @@ -24,7 +24,7 @@ on: exclude_archs: required: false type: string - default: "windows_amd64_rtools" + default: "windows_amd64_rtools;wasm_mvp;wasm_eh;wasm_threads" # default: "linux_amd64;linux_amd64_gcc4;osx_arm64;wasm_mvp;wasm_eh;wasm_threads;linux_arm64;osx_amd64;windows_amd64_rtools" # Postfix added to artifact names. Can be used to guarantee unique names when this workflow is called multiple times artifact_postfix: diff --git a/.vscode/settings.json b/.vscode/settings.json index 74231f3..997cde6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -93,6 +93,8 @@ "*.tcc": "cpp", "memory_resource": "cpp", "string_view": "cpp", - "shared_mutex": "cpp" + "shared_mutex": "cpp", + "any": "cpp", + "valarray": "cpp" } } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 07e1221..29df1f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ set(TARGET_NAME pbix) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) + project(${TARGET_NAME}) include_directories(src/include) include_directories(src/abf) diff --git a/extension_config.cmake b/extension_config.cmake index bf04bf9..3cf1192 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -7,4 +7,6 @@ duckdb_extension_load(pbix ) # Any extra extensions that should be built -# e.g.: duckdb_extension_load(json) \ No newline at end of file +# e.g.: +# duckdb_extension_load(json) +# duckdb_extension_load(httpfs) \ No newline at end of file diff --git a/ksy/zip_central_dir.ksy b/ksy/zip_central_dir.ksy new file mode 100644 index 0000000..2f59695 --- /dev/null +++ b/ksy/zip_central_dir.ksy @@ -0,0 +1,91 @@ +meta: + id: pbix + title: PBIX archive file + file-extension: pbix + endian: le + bit-endian: le +seq: + - id: sections + type: pk_section + repeat: eos +types: + pk_section: + seq: + - id: magic + contents: 'PK' + - id: section_type + type: u2 + - id: body + type: + switch-on: section_type + cases: + 0x0201: central_dir_entry + 0x0403: local_file + 0x0605: end_of_central_dir + 0x0807: data_descriptor + data_descriptor: + seq: + - id: data_descriptor_obs + size: 12 + local_file: + seq: + - id: header + type: local_file_header + - id: body + size: header.len_body_compressed + local_file_header: + seq: + - id: header_trimmed + size: 14 + - id: len_body_compressed + type: u4 + - id: len_body_uncompressed + type: u4 + - id: len_file_name + type: u2 + - id: len_extra + type: u2 + - id: file_name + size: len_file_name + - id: extra + size: len_extra + central_dir_entry: + seq: + - id: header_obs + size: 12 + - id: crc32 + type: u4 + - id: len_body_compressed + type: u4 + - id: len_body_uncompressed + type: u4 + - id: len_file_name + type: u2 + - id: len_extra + type: u2 + - id: len_comment + type: u2 + - id: disk_number_start + type: u2 + - id: int_file_attr + type: u2 + - id: ext_file_attr + type: u4 + - id: ofs_local_header + type: s4 + - id: file_name + type: str + size: len_file_name + encoding: UTF-8 + - id: extra + size: len_extra + - id: comment + size: len_comment + end_of_central_dir: + seq: + - id: header_obs + size: 16 + - id: len_comment + type: u2 + - id: comment + size: len_comment diff --git a/src/abf/AbfParser.cpp b/src/abf/AbfParser.cpp index 421a469..f54d1be 100644 --- a/src/abf/AbfParser.cpp +++ b/src/abf/AbfParser.cpp @@ -1,6 +1,9 @@ #include "AbfParser.h" +// #include "duckdb/common/file_system.hpp" + using namespace tinyxml2; +using namespace duckdb; std::vector AbfParser::read_buffer_bytes(const std::vector &buffer, uint64_t offset, int size) { @@ -109,6 +112,49 @@ std::pair AbfParser::initialize_zip_and_locate_datamodel(con return {file_stat.m_local_header_ofs, file_stat.m_comp_size}; } + +std::pair AbfParser::locate_datamodel(duckdb::FileHandle &file_handle_p, const std::string &path) { + constexpr auto DataModelFileName = "DataModel"; + mz_zip_archive zip_archive; + memset(&zip_archive, 0, sizeof(zip_archive)); + + + // Setup the custom IO operations + zip_archive.m_pIO_opaque = &file_handle_p; + zip_archive.m_pRead = [](void *opaque, mz_uint64 file_offset, void *buffer, size_t n) { + auto handle = static_cast(opaque); + handle->Seek(file_offset); + return static_cast(handle->Read(buffer, n)); + }; + + // Initialize the zip archive for reading using the custom IO + if (!mz_zip_reader_init(&zip_archive, file_handle_p.GetFileSize(), MZ_ZIP_FLAG_COMPRESSED_DATA)) { // Note: MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY might be needed depending on use case + throw std::runtime_error("Could not initialize zip reader"); + } + + // Locate the DataModel file within the zip + int file_index = mz_zip_reader_locate_file(&zip_archive, DataModelFileName, nullptr, 0); + if (file_index < 0) { + mz_zip_reader_end(&zip_archive); // Clean up before throwing + throw std::runtime_error("DataModel not found in the zip file."); + } + + // Retrieve information about the DataModel file + mz_zip_archive_file_stat file_stat; + if (!mz_zip_reader_file_stat(&zip_archive, file_index, &file_stat)) { + mz_zip_reader_end(&zip_archive); // Clean up before throwing + throw std::runtime_error("Could not retrieve information about DataModel."); + } + + // // Clean up the zip reader as it's no longer needed after getting the info + mz_zip_reader_end(&zip_archive); + // file_handle = file_handle_p.release(); + + // Return the offset and compressed size of the DataModel file + return {file_stat.m_local_header_ofs, file_stat.m_comp_size}; +} + + void AbfParser::read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs) { // Read compressed DataModel header to adjust offset entryStream.seekg(datamodel_ofs+ZIP_LOCAL_FILE_HEADER_FIXED); @@ -122,7 +168,7 @@ void AbfParser::read_compressed_datamodel_header(std::ifstream &entryStream, uin std::vector AbfParser::decompress_initial_block(std::ifstream &entryStream, uint64_t datamodel_ofs, XPress9Wrapper &xpress9_wrapper) { // Seek to the start of the DataModel compressed data - entryStream.seekg(datamodel_ofs + ABF_XPRESS9_SIGNATRUE, std::ios::beg); + entryStream.seekg(datamodel_ofs + ABF_XPRESS9_SIGNATURE, std::ios::beg); uint32_t uncompressed_size; uint32_t compressed_size; @@ -144,10 +190,39 @@ std::vector AbfParser::decompress_initial_block(std::ifstream &entryStr } return decompressed_buffer; } +std::vector AbfParser::decompress_initial_block(duckdb::FileHandle &file_handle_p, uint64_t &bytes_read,XPress9Wrapper &xpress9_wrapper) { + // Seek to the start of the DataModel compressed data + std::vector signature(ABF_XPRESS9_SIGNATURE); + file_handle_p.Read(reinterpret_cast(signature.data()),ABF_XPRESS9_SIGNATURE); + + bytes_read += ABF_XPRESS9_SIGNATURE; + + uint32_t uncompressed_size; + uint32_t compressed_size; + // Read the compressed and uncompressed sizes before the offset + file_handle_p.Read(reinterpret_cast(&uncompressed_size), sizeof(uint32_t)); + file_handle_p.Read(reinterpret_cast(&compressed_size), sizeof(uint32_t)); + bytes_read += sizeof(uint32_t) + sizeof(uint32_t); + + // Allocate buffers for compressed and decompressed data + std::vector decompressed_buffer(uncompressed_size); + std::vector compressed_buffer(compressed_size); + + file_handle_p.Read(reinterpret_cast(compressed_buffer.data()), compressed_size); + bytes_read += compressed_size; + + // Decompress the entire data + uint32_t decompressed_size = xpress9_wrapper.Decompress(compressed_buffer.data(), compressed_size, decompressed_buffer.data(), decompressed_buffer.size()); + // Verify that the total decompressed size matches the expected size + if (decompressed_size != uncompressed_size) { + throw std::runtime_error("Mismatch in decompressed block size in first block."); + } + // file_handle = file_handle_p.release(); + return decompressed_buffer; +} std::vector AbfParser::iterate_and_decompress_blocks(std::ifstream &entryStream, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset) { // Calculate the total number of blocks - constexpr uint32_t BLOCK_SIZE = 0x200000; auto total_blocks = (virtual_directory_size + virtual_directory_offset) / BLOCK_SIZE; std::vector all_decompressed_data; @@ -195,6 +270,61 @@ std::vector AbfParser::iterate_and_decompress_blocks(std::ifstream &ent return all_decompressed_data; } +std::vector AbfParser::iterate_and_decompress_blocks(duckdb::FileHandle &file_handle_p, uint64_t &bytes_read, uint64_t datamodel_ofs,uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset) { + // Calculate the total number of blocks + + auto total_blocks = (virtual_directory_size + virtual_directory_offset) / BLOCK_SIZE; + + std::vector all_decompressed_data; + uint32_t block_index = 0; + uint32_t block_index_iterator = 0; + + // Iterate through each block in the DataModel + while (bytes_read < datamodel_size) { + block_index++; + // Read the compressed and uncompressed sizes + uint32_t uncompressed_size = 0; + uint32_t compressed_size = 0; + file_handle_p.Read(reinterpret_cast(&uncompressed_size), sizeof(uncompressed_size)); + file_handle_p.Read(reinterpret_cast(&compressed_size), sizeof(compressed_size)); + bytes_read += sizeof(uncompressed_size) + sizeof(compressed_size); + + // Skip blocks if not within the last `trailing_blocks` (based on your logic) + if (total_blocks > trailing_blocks && block_index < (total_blocks - trailing_blocks)) { + skip_offset += uncompressed_size; + bytes_read += compressed_size; + file_handle_p.Seek(datamodel_ofs+bytes_read); // Skip this block + continue; + } + + // Allocate buffers for the compressed and decompressed data + std::vector compressed_buffer(compressed_size); + std::vector decompressed_buffer(uncompressed_size); + + // Read the compressed block + file_handle_p.Read(reinterpret_cast(compressed_buffer.data()), compressed_size); + bytes_read += compressed_size; + + // call to a new function process header_buffer which we'll use to modify compressed_buffer + patch_header_of_compressed_buffer(compressed_buffer, block_index_iterator); + + // Decompress the block + uint32_t decompressed_size = xpress9_wrapper.Decompress(compressed_buffer.data(), compressed_size, decompressed_buffer.data(), decompressed_buffer.size()); + + // Verify decompression success + if (decompressed_size != uncompressed_size) { + throw std::runtime_error("Decompression failed or resulted in unexpected size."); + } + + // Add decompressed data to the overall buffer + all_decompressed_data.insert(all_decompressed_data.end(), decompressed_buffer.begin(), decompressed_buffer.end()); + } + + // file_handle = file_handle_p.release(); + + return all_decompressed_data; +} + std::vector AbfParser::get_sqlite(const std::string &path, const int trailing_blocks=15) { // Initialize zip and locate DataModel @@ -228,6 +358,55 @@ std::vector AbfParser::get_sqlite(const std::string &path, const int tr // Prefix all_decompressed_buffer with initial_decompressed_buffer in case we have only one block all_decompressed_buffer.insert(all_decompressed_buffer.begin(), initial_decompressed_buffer.begin(), initial_decompressed_buffer.end()); + if (skip_offset + all_decompressed_buffer.size() < virtual_directory_offset + virtual_directory_size) + { + throw std::runtime_error("Could not parse the entire DataModel."); + } + // Finally, extract the SQLite buffer from the decompressed data + return extract_sqlite_buffer(all_decompressed_buffer, skip_offset, virtual_directory_offset, virtual_directory_size); +} +std::vector AbfParser::get_sqlite_v2(duckdb::ClientContext &context, const std::string &path, const int trailing_blocks=15) +{ + auto &fs = duckdb::FileSystem::GetFileSystem(context); + // Open the file using FileSystem + auto file_handle = fs.OpenFile(path, FILE_READ); + if (!file_handle) { + throw std::runtime_error("Could not open zip file"); + } + + auto [datamodel_ofs, datamodel_size] = locate_datamodel(*file_handle, path); + uint64_t bytes_read = 0; + uint16_t zip_pointer = 0; + + // Read compressed DataModel header to adjust offset + file_handle->Seek(datamodel_ofs+ZIP_LOCAL_FILE_HEADER_FIXED); + uint16_t filename_len = 0; + uint16_t extra_len = 0; + file_handle->Read(reinterpret_cast(&filename_len), sizeof(filename_len)); + file_handle->Read(reinterpret_cast(&extra_len), sizeof(extra_len)); + datamodel_ofs += ZIP_LOCAL_FILE_HEADER + filename_len + extra_len; + + file_handle->Seek(datamodel_ofs); + + XPress9Wrapper xpress9_wrapper; + if (!xpress9_wrapper.Initialize()) + { + throw std::runtime_error("Failed to initialize XPress9Wrapper"); + } + + // Decompress initial block to get the virtual directory info + auto initial_decompressed_buffer = decompress_initial_block(*file_handle, bytes_read,xpress9_wrapper); + + // Process backup log header to get virtual directory offset and size + auto [virtual_directory_offset, virtual_directory_size] = process_backup_log_header(initial_decompressed_buffer); + + uint64_t skip_offset = 0; //optimization for skipping blocks + // Iterate through the remaining blocks and decompress them + auto all_decompressed_buffer = iterate_and_decompress_blocks(*file_handle, bytes_read, datamodel_ofs, datamodel_size, xpress9_wrapper, virtual_directory_offset, virtual_directory_size, trailing_blocks, skip_offset); + + // Prefix all_decompressed_buffer with initial_decompressed_buffer in case we have only one block + all_decompressed_buffer.insert(all_decompressed_buffer.begin(), initial_decompressed_buffer.begin(), initial_decompressed_buffer.end()); + if (skip_offset + all_decompressed_buffer.size() < virtual_directory_offset + virtual_directory_size) { throw std::runtime_error("Could not parse the entire DataModel."); diff --git a/src/abf/AbfParser.h b/src/abf/AbfParser.h index c7c4589..2910b48 100644 --- a/src/abf/AbfParser.h +++ b/src/abf/AbfParser.h @@ -17,27 +17,38 @@ #include "Xpress9Wrapper.h" #include "Crc32.h" +#include "duckdb.hpp" + + // Constants related to ZIP file parsing constexpr unsigned char ZIP_LOCAL_FILE_HEADER_FIXED = 26; constexpr unsigned char ZIP_LOCAL_FILE_HEADER = 30; -constexpr unsigned char ABF_XPRESS9_SIGNATRUE = 102; +constexpr unsigned char ABF_XPRESS9_SIGNATURE = 102; constexpr unsigned char ABF_BACKUP_LOG_HEADER_OFFSET = 72; +constexpr uint32_t BLOCK_SIZE = 0x200000; constexpr unsigned short ABF_BACKUP_LOG_HEADER_SIZE = 0x1000 - ABF_BACKUP_LOG_HEADER_OFFSET; +static constexpr idx_t FILE_READ = idx_t(1 << 0); class AbfParser { public: static std::vector get_sqlite(const std::string &path, const int trailing_chunks); + static std::vector get_sqlite_v2(duckdb::ClientContext &context,const std::string &path, const int trailing_chunks); private: + // duckdb::FileHandle *file_handle; + // mz_zip_archive zip_archive; static void patch_header_of_compressed_buffer(std::vector &compressed_buffer, uint32_t& block_index_iterator); static std::vector read_buffer_bytes(const std::vector& buffer, uint64_t offset, int size); static std::vector trim_buffer(const std::vector& buffer); static std::tuple process_backup_log_header(const std::vector &buffer); static std::vector extract_sqlite_buffer(const std::vector &buffer, uint64_t skip_offset, uint64_t virtual_directory_offset, int virtual_directory_size); static std::pair initialize_zip_and_locate_datamodel(const std::string &path); + static std::pair locate_datamodel(duckdb::FileHandle &file_handle, const std::string &path); static void read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs); static std::vector decompress_initial_block(std::ifstream &entryStream, uint64_t datamodel_ofs, XPress9Wrapper &xpress9_wrapper); + static std::vector decompress_initial_block(duckdb::FileHandle &file_handle, uint64_t &bytes_read, XPress9Wrapper &xpress9_wrapper); static std::vector iterate_and_decompress_blocks(std::ifstream &entryStream, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset); + static std::vector iterate_and_decompress_blocks(duckdb::FileHandle &file_handle, uint64_t &bytes_read, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset); }; class Header { diff --git a/src/abf/ZipUtils.cpp b/src/abf/ZipUtils.cpp new file mode 100644 index 0000000..d4c1a10 --- /dev/null +++ b/src/abf/ZipUtils.cpp @@ -0,0 +1,56 @@ +#include "ZipUtils.h" +#include +#include +#include +#include +#include + +class ZipUtils { +public: + static bool findEndOfCentralDirectory(std::istream& stream, EndOfCentralDirectoryRecord& eocd) { + const uint32_t signatureEOCD = 0x06054b50; + std::vector buffer(4096); + stream.seekg(0, std::ios::end); + std::streampos fileSize = stream.tellg(); + int64_t searchOffset = std::min(static_cast(fileSize), static_cast(buffer.size())); + + stream.seekg(-searchOffset, std::ios::end); + stream.read(buffer.data(), searchOffset); + auto foundPos = std::search(buffer.rbegin(), buffer.rend(), reinterpret_cast(&signatureEOCD), reinterpret_cast(&signatureEOCD) + sizeof(signatureEOCD)); + + if (foundPos != buffer.rend()) { + size_t offset = std::distance(buffer.begin(), foundPos.base()) - sizeof(signatureEOCD); + stream.seekg(-searchOffset + offset, std::ios::end); + stream.read(reinterpret_cast(&eocd), sizeof(eocd)); + return true; + } + return false; + } + + static std::pair findDataModel(std::istream& zipStream) { + EndOfCentralDirectoryRecord eocd; + if (!findEndOfCentralDirectory(zipStream, eocd)) { + throw std::runtime_error("End of central directory not found."); + } + + zipStream.seekg(eocd.centralDirectoryOffset, std::ios::beg); + CentralDirectoryFileHeader cdHeader; + + for (int i = 0; i < eocd.numEntries; ++i) { + zipStream.read(reinterpret_cast(&cdHeader), sizeof(cdHeader)); + if (cdHeader.signature != 0x02014b50) { + throw std::runtime_error("Invalid central directory file header signature."); + } + + std::vector filename(cdHeader.fileNameLength); + zipStream.read(filename.data(), cdHeader.fileNameLength); + zipStream.ignore(cdHeader.extraFieldLength + cdHeader.fileCommentLength); + + if (std::string(filename.begin(), filename.end()) == "DataModel") { + return {cdHeader.localHeaderOffset, cdHeader.compressedSize}; + } + } + + throw std::runtime_error("DataModel not found in the zip file."); + } +}; diff --git a/src/abf/ZipUtils.h b/src/abf/ZipUtils.h new file mode 100644 index 0000000..57e4f7a --- /dev/null +++ b/src/abf/ZipUtils.h @@ -0,0 +1,48 @@ +#ifndef ZIP_UTILS_H +#define ZIP_UTILS_H + +#include +#include +#include + +// Structure to store the end of central directory record. +struct EndOfCentralDirectoryRecord { + uint32_t signature; + uint16_t diskNumber; + uint16_t centralDirectoryDiskNumber; + uint16_t numEntriesThisDisk; + uint16_t numEntries; + uint32_t centralDirectorySize; + uint32_t centralDirectoryOffset; + uint16_t commentLength; +}; + +// Structure to store the central directory file header. +struct CentralDirectoryFileHeader { + uint32_t signature; + uint16_t versionMadeBy; + uint16_t versionNeededToExtract; + uint16_t generalPurposeBitFlag; + uint16_t compressionMethod; + uint16_t lastModFileTime; + uint16_t lastModFileDate; + uint32_t crc32; + uint32_t compressedSize; + uint32_t uncompressedSize; + uint16_t fileNameLength; + uint16_t extraFieldLength; + uint16_t fileCommentLength; + uint16_t diskNumberStart; + uint16_t internalFileAttributes; + uint32_t externalFileAttributes; + uint32_t localHeaderOffset; +}; + +// Class containing static utility methods for working with ZIP files. +class ZipUtils { +public: + static bool findEndOfCentralDirectory(std::istream& stream, EndOfCentralDirectoryRecord& eocd); + static std::pair findDataModel(std::istream& zipStream); +}; + +#endif // ZIP_UTILS_H diff --git a/src/include/sqlite_db.hpp b/src/include/sqlite_db.hpp index 0e5308e..e5e7fbe 100644 --- a/src/include/sqlite_db.hpp +++ b/src/include/sqlite_db.hpp @@ -32,7 +32,7 @@ class SQLiteDB { public: static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); - static SQLiteDB OpenFromBuffer(const string &path, const SQLiteOpenOptions &options,const std::vector &buffer); + static SQLiteDB OpenFromBuffer(const SQLiteOpenOptions &options,const std::vector &buffer); bool TryPrepare(const string &query, SQLiteStatement &result); SQLiteStatement Prepare(const string &query); void Execute(const string &query); diff --git a/src/pbix_scanner.cpp b/src/pbix_scanner.cpp index 6122aab..261f3b1 100644 --- a/src/pbix_scanner.cpp +++ b/src/pbix_scanner.cpp @@ -51,8 +51,8 @@ struct PbixGlobalState : public GlobalTableFunctionState { static SQLiteDB ExtractDB(ClientContext &context, const string &path, int trailing_chunks) { SQLiteOpenOptions options; - auto sqliteBuffer = AbfParser::get_sqlite(path, trailing_chunks); - return SQLiteDB::OpenFromBuffer(path, options, sqliteBuffer); + auto sqliteBuffer = AbfParser::get_sqlite_v2(context, path, trailing_chunks); + return SQLiteDB::OpenFromBuffer(options, sqliteBuffer); } diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index ca2d947..2f083eb 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -64,7 +64,7 @@ SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bo return result; } -SQLiteDB SQLiteDB::OpenFromBuffer(const string &path, const SQLiteOpenOptions &options, const std::vector &buffer){ +SQLiteDB SQLiteDB::OpenFromBuffer(const SQLiteOpenOptions &options, const std::vector &buffer){ SQLiteDB result; result.dbBuffer = buffer; // buffer for in-memory database // int flags = SQLITE_OPEN_SHAREDCACHE | SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_MEMORY; diff --git a/third_party/kaitai/custom_decoder.h b/third_party/kaitai/custom_decoder.h new file mode 100644 index 0000000..6da7f5f --- /dev/null +++ b/third_party/kaitai/custom_decoder.h @@ -0,0 +1,16 @@ +#ifndef KAITAI_CUSTOM_DECODER_H +#define KAITAI_CUSTOM_DECODER_H + +#include + +namespace kaitai { + +class custom_decoder { +public: + virtual ~custom_decoder() {}; + virtual std::string decode(std::string src) = 0; +}; + +} + +#endif diff --git a/third_party/kaitai/exceptions.h b/third_party/kaitai/exceptions.h new file mode 100644 index 0000000..1c1e414 --- /dev/null +++ b/third_party/kaitai/exceptions.h @@ -0,0 +1,224 @@ +#ifndef KAITAI_EXCEPTIONS_H +#define KAITAI_EXCEPTIONS_H + +#include + +#include +#include + +// We need to use "noexcept" in virtual destructor of our exceptions +// subclasses. Different compilers have different ideas on how to +// achieve that: C++98 compilers prefer `throw()`, C++11 and later +// use `noexcept`. We define KS_NOEXCEPT macro for that. + +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) +#define KS_NOEXCEPT noexcept +#else +#define KS_NOEXCEPT throw() +#endif + +namespace kaitai { + +/** + * Common ancestor for all errors related to `bytes_to_str` operation. Also used + * to signal misc non-specific `bytes_to_str` failures. + */ +class bytes_to_str_error: public std::runtime_error { +public: + bytes_to_str_error(const std::string what): + std::runtime_error(std::string("bytes_to_str error: ") + what) {} + + virtual ~bytes_to_str_error() KS_NOEXCEPT {}; +}; + +/** + * Exception to signal that `bytes_to_str` operation was requested to use some encoding + * that is not available in given runtime environment. + */ +class unknown_encoding: public bytes_to_str_error { +public: + unknown_encoding(const std::string enc_name): + bytes_to_str_error(std::string("unknown encoding: `") + enc_name + std::string("`")) {} + + virtual ~unknown_encoding() KS_NOEXCEPT {}; +}; + +/** + * Exception to signal that `bytes_to_str` operation failed to decode given byte sequence. + */ +class illegal_seq_in_encoding: public bytes_to_str_error { +public: + illegal_seq_in_encoding(const std::string what): + bytes_to_str_error("illegal sequence: " + what) {} + + virtual ~illegal_seq_in_encoding() KS_NOEXCEPT {}; +}; + +/** + * Common ancestor for all error originating from Kaitai Struct usage. + * Stores KSY source path, pointing to an element supposedly guilty of + * an error. + */ +class kstruct_error: public std::runtime_error { +public: + kstruct_error(const std::string what, const std::string src_path): + std::runtime_error(src_path + ": " + what), + m_src_path(src_path) + { + } + + virtual ~kstruct_error() KS_NOEXCEPT {}; + +protected: + const std::string m_src_path; +}; + +/** + * Error that occurs when default endianness should be decided with + * a switch, but nothing matches (although using endianness expression + * implies that there should be some positive result). + */ +class undecided_endianness_error: public kstruct_error { +public: + undecided_endianness_error(const std::string src_path): + kstruct_error("unable to decide on endianness for a type", src_path) + { + } + + virtual ~undecided_endianness_error() KS_NOEXCEPT {}; +}; + +/** + * Common ancestor for all validation failures. Stores pointer to + * KaitaiStream IO object which was involved in an error. + */ +class validation_failed_error: public kstruct_error { +public: + validation_failed_error(const std::string what, kstream* io, const std::string src_path): + kstruct_error("at pos " + kstream::to_string(io->pos()) + ": validation failed: " + what, src_path), + m_io(io) + { + } + +// "at pos #{io.pos}: validation failed: #{msg}" + + virtual ~validation_failed_error() KS_NOEXCEPT {}; + +protected: + kstream* m_io; +}; + +/** + * Signals validation failure: we required "actual" value to be equal to + * "expected", but it turned out that it's not. + */ +template +class validation_not_equal_error: public validation_failed_error { +public: + validation_not_equal_error(const T& expected, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not equal", io, src_path), + m_expected(expected), + m_actual(actual) + { + } + + // "not equal, expected #{expected.inspect}, but got #{actual.inspect}" + + virtual ~validation_not_equal_error() KS_NOEXCEPT {}; + +protected: + const T& m_expected; + const T& m_actual; +}; + +/** + * Signals validation failure: we required "actual" value to be greater + * than or equal to "min", but it turned out that it's not. + */ +template +class validation_less_than_error: public validation_failed_error { +public: + validation_less_than_error(const T& min, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not in range", io, src_path), + m_min(min), + m_actual(actual) + { + } + + // "not in range, min #{min.inspect}, but got #{actual.inspect}" + + virtual ~validation_less_than_error() KS_NOEXCEPT {}; + +protected: + const T& m_min; + const T& m_actual; +}; + +/** + * Signals validation failure: we required "actual" value to be less + * than or equal to "max", but it turned out that it's not. + */ +template +class validation_greater_than_error: public validation_failed_error { +public: + validation_greater_than_error(const T& max, const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not in range", io, src_path), + m_max(max), + m_actual(actual) + { + } + + // "not in range, max #{max.inspect}, but got #{actual.inspect}" + + virtual ~validation_greater_than_error() KS_NOEXCEPT {}; + +protected: + const T& m_max; + const T& m_actual; +}; + +/** + * Signals validation failure: we required "actual" value to be from + * the list, but it turned out that it's not. + */ +template +class validation_not_any_of_error: public validation_failed_error { +public: + validation_not_any_of_error(const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not any of the list", io, src_path), + m_actual(actual) + { + } + + // "not any of the list, got #{actual.inspect}" + + virtual ~validation_not_any_of_error() KS_NOEXCEPT {}; + +protected: + const T& m_actual; +}; + +/** + * Signals validation failure: we required "actual" value to match + * the expression, but it turned out that it doesn't. + */ +template +class validation_expr_error: public validation_failed_error { +public: + validation_expr_error(const T& actual, kstream* io, const std::string src_path): + validation_failed_error("not matching the expression", io, src_path), + m_actual(actual) + { + } + + // "not matching the expression, got #{actual.inspect}" + + virtual ~validation_expr_error() KS_NOEXCEPT {}; + +protected: + const T& m_actual; +}; + +} + +#endif diff --git a/third_party/kaitai/kaitaistream.cpp b/third_party/kaitai/kaitaistream.cpp new file mode 100644 index 0000000..f3d95eb --- /dev/null +++ b/third_party/kaitai/kaitaistream.cpp @@ -0,0 +1,877 @@ +#include +#include + +#if defined(__APPLE__) +#include +#include +#define bswap_16(x) OSSwapInt16(x) +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) +#define __BYTE_ORDER BYTE_ORDER +#define __BIG_ENDIAN BIG_ENDIAN +#define __LITTLE_ENDIAN LITTLE_ENDIAN +#elif defined(_MSC_VER) // !__APPLE__ +#include +#define __LITTLE_ENDIAN 1234 +#define __BIG_ENDIAN 4321 +#define __BYTE_ORDER __LITTLE_ENDIAN +#define bswap_16(x) _byteswap_ushort(x) +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) +#elif defined(__QNX__) // __QNX__ +#include +#include +#define bswap_16(x) ENDIAN_RET16(x) +#define bswap_32(x) ENDIAN_RET32(x) +#define bswap_64(x) ENDIAN_RET64(x) +#define __BYTE_ORDER BYTE_ORDER +#define __BIG_ENDIAN BIG_ENDIAN +#define __LITTLE_ENDIAN LITTLE_ENDIAN +#else // !__APPLE__ or !_MSC_VER or !__QNX__ +#include +#include +#endif + +#include +#include +#include + +kaitai::kstream::kstream(std::istream *io) { + m_io = io; + init(); +} + +kaitai::kstream::kstream(const std::string &data) : m_io_str(data) { + m_io = &m_io_str; + init(); +} + +void kaitai::kstream::init() { + exceptions_enable(); + align_to_byte(); +} + +void kaitai::kstream::close() { + // m_io->close(); +} + +void kaitai::kstream::exceptions_enable() const { + m_io->exceptions( + std::istream::eofbit | + std::istream::failbit | + std::istream::badbit + ); +} + +// ======================================================================== +// Stream positioning +// ======================================================================== + +bool kaitai::kstream::is_eof() const { + if (m_bits_left > 0) { + return false; + } + char t; + m_io->exceptions(std::istream::badbit); + m_io->get(t); + if (m_io->eof()) { + m_io->clear(); + exceptions_enable(); + return true; + } else { + m_io->unget(); + exceptions_enable(); + return false; + } +} + +void kaitai::kstream::seek(uint64_t pos) { + m_io->seekg(pos); +} + +uint64_t kaitai::kstream::pos() { + return m_io->tellg(); +} + +uint64_t kaitai::kstream::size() { + std::iostream::pos_type cur_pos = m_io->tellg(); + m_io->seekg(0, std::ios::end); + std::iostream::pos_type len = m_io->tellg(); + m_io->seekg(cur_pos); + return len; +} + +// ======================================================================== +// Integer numbers +// ======================================================================== + +// ------------------------------------------------------------------------ +// Signed +// ------------------------------------------------------------------------ + +int8_t kaitai::kstream::read_s1() { + char t; + m_io->get(t); + return t; +} + +// ........................................................................ +// Big-endian +// ........................................................................ + +int16_t kaitai::kstream::read_s2be() { + int16_t t; + m_io->read(reinterpret_cast(&t), 2); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_16(t); +#endif + return t; +} + +int32_t kaitai::kstream::read_s4be() { + int32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_32(t); +#endif + return t; +} + +int64_t kaitai::kstream::read_s8be() { + int64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_64(t); +#endif + return t; +} + +// ........................................................................ +// Little-endian +// ........................................................................ + +int16_t kaitai::kstream::read_s2le() { + int16_t t; + m_io->read(reinterpret_cast(&t), 2); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_16(t); +#endif + return t; +} + +int32_t kaitai::kstream::read_s4le() { + int32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_32(t); +#endif + return t; +} + +int64_t kaitai::kstream::read_s8le() { + int64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_64(t); +#endif + return t; +} + +// ------------------------------------------------------------------------ +// Unsigned +// ------------------------------------------------------------------------ + +uint8_t kaitai::kstream::read_u1() { + char t; + m_io->get(t); + return t; +} + +// ........................................................................ +// Big-endian +// ........................................................................ + +uint16_t kaitai::kstream::read_u2be() { + uint16_t t; + m_io->read(reinterpret_cast(&t), 2); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_16(t); +#endif + return t; +} + +uint32_t kaitai::kstream::read_u4be() { + uint32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_32(t); +#endif + return t; +} + +uint64_t kaitai::kstream::read_u8be() { + uint64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_64(t); +#endif + return t; +} + +// ........................................................................ +// Little-endian +// ........................................................................ + +uint16_t kaitai::kstream::read_u2le() { + uint16_t t; + m_io->read(reinterpret_cast(&t), 2); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_16(t); +#endif + return t; +} + +uint32_t kaitai::kstream::read_u4le() { + uint32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_32(t); +#endif + return t; +} + +uint64_t kaitai::kstream::read_u8le() { + uint64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_64(t); +#endif + return t; +} + +// ======================================================================== +// Floating point numbers +// ======================================================================== + +// ........................................................................ +// Big-endian +// ........................................................................ + +float kaitai::kstream::read_f4be() { + uint32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_32(t); +#endif + return reinterpret_cast(t); +} + +double kaitai::kstream::read_f8be() { + uint64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __LITTLE_ENDIAN + t = bswap_64(t); +#endif + return reinterpret_cast(t); +} + +// ........................................................................ +// Little-endian +// ........................................................................ + +float kaitai::kstream::read_f4le() { + uint32_t t; + m_io->read(reinterpret_cast(&t), 4); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_32(t); +#endif + return reinterpret_cast(t); +} + +double kaitai::kstream::read_f8le() { + uint64_t t; + m_io->read(reinterpret_cast(&t), 8); +#if __BYTE_ORDER == __BIG_ENDIAN + t = bswap_64(t); +#endif + return reinterpret_cast(t); +} + +// ======================================================================== +// Unaligned bit values +// ======================================================================== + +void kaitai::kstream::align_to_byte() { + m_bits_left = 0; + m_bits = 0; +} + +uint64_t kaitai::kstream::read_bits_int_be(int n) { + uint64_t res = 0; + + int bits_needed = n - m_bits_left; + m_bits_left = -bits_needed & 7; // `-bits_needed mod 8` + + if (bits_needed > 0) { + // 1 bit => 1 byte + // 8 bits => 1 byte + // 9 bits => 2 bytes + int bytes_needed = ((bits_needed - 1) / 8) + 1; // `ceil(bits_needed / 8)` + if (bytes_needed > 8) + throw std::runtime_error("read_bits_int_be: more than 8 bytes requested"); + uint8_t buf[8]; + m_io->read(reinterpret_cast(buf), bytes_needed); + for (int i = 0; i < bytes_needed; i++) { + res = res << 8 | buf[i]; + } + + uint64_t new_bits = res; + res = res >> m_bits_left | (bits_needed < 64 ? m_bits << bits_needed : 0); // avoid undefined behavior of `x << 64` + m_bits = new_bits; // will be masked at the end of the function + } else { + res = m_bits >> -bits_needed; // shift unneeded bits out + } + + uint64_t mask = (static_cast(1) << m_bits_left) - 1; // `m_bits_left` is in range 0..7, so `(1 << 64)` does not have to be considered + m_bits &= mask; + + return res; +} + +// Deprecated, use read_bits_int_be() instead. +uint64_t kaitai::kstream::read_bits_int(int n) { + return read_bits_int_be(n); +} + +uint64_t kaitai::kstream::read_bits_int_le(int n) { + uint64_t res = 0; + int bits_needed = n - m_bits_left; + + if (bits_needed > 0) { + // 1 bit => 1 byte + // 8 bits => 1 byte + // 9 bits => 2 bytes + int bytes_needed = ((bits_needed - 1) / 8) + 1; // `ceil(bits_needed / 8)` + if (bytes_needed > 8) + throw std::runtime_error("read_bits_int_le: more than 8 bytes requested"); + uint8_t buf[8]; + m_io->read(reinterpret_cast(buf), bytes_needed); + for (int i = 0; i < bytes_needed; i++) { + res |= static_cast(buf[i]) << (i * 8); + } + + // NB: for bit shift operators in C++, "if the value of the right operand is + // negative or is greater or equal to the number of bits in the promoted left + // operand, the behavior is undefined." (see + // https://en.cppreference.com/w/cpp/language/operator_arithmetic#Bitwise_shift_operators) + // So we define our desired behavior here. + uint64_t new_bits = bits_needed < 64 ? res >> bits_needed : 0; + res = res << m_bits_left | m_bits; + m_bits = new_bits; + } else { + res = m_bits; + m_bits >>= n; + } + + m_bits_left = -bits_needed & 7; // `-bits_needed mod 8` + + if (n < 64) { + uint64_t mask = (static_cast(1) << n) - 1; + res &= mask; + } + // if `n == 64`, do nothing + return res; +} + +// ======================================================================== +// Byte arrays +// ======================================================================== + +std::string kaitai::kstream::read_bytes(std::streamsize len) { + std::vector result(len); + + // NOTE: streamsize type is signed, negative values are only *supposed* to not be used. + // http://en.cppreference.com/w/cpp/io/streamsize + if (len < 0) { + throw std::runtime_error("read_bytes: requested a negative amount"); + } + + if (len > 0) { + m_io->read(&result[0], len); + } + + return std::string(result.begin(), result.end()); +} + +std::string kaitai::kstream::read_bytes_full() { + std::iostream::pos_type p1 = m_io->tellg(); + m_io->seekg(0, std::ios::end); + std::iostream::pos_type p2 = m_io->tellg(); + size_t len = p2 - p1; + + // Note: this requires a std::string to be backed with a + // contiguous buffer. Officially, it's a only requirement since + // C++11 (C++98 and C++03 didn't have this requirement), but all + // major implementations had contiguous buffers anyway. + std::string result(len, ' '); + m_io->seekg(p1); + m_io->read(&result[0], len); + + return result; +} + +std::string kaitai::kstream::read_bytes_term(char term, bool include, bool consume, bool eos_error) { + std::string result; + std::getline(*m_io, result, term); + if (m_io->eof()) { + // encountered EOF + if (eos_error) { + throw std::runtime_error("read_bytes_term: encountered EOF"); + } + } else { + // encountered terminator + if (include) + result.push_back(term); + if (!consume) + m_io->unget(); + } + return result; +} + +std::string kaitai::kstream::ensure_fixed_contents(std::string expected) { + std::string actual = read_bytes(expected.length()); + + if (actual != expected) { + // NOTE: I think printing it outright is not best idea, it could contain non-ASCII characters + // like backspace and beeps and whatnot. It would be better to print hexlified version, and + // also to redirect it to stderr. + throw std::runtime_error("ensure_fixed_contents: actual data does not match expected data"); + } + + return actual; +} + +std::string kaitai::kstream::bytes_strip_right(std::string src, char pad_byte) { + std::size_t new_len = src.length(); + + while (new_len > 0 && src[new_len - 1] == pad_byte) + new_len--; + + return src.substr(0, new_len); +} + +std::string kaitai::kstream::bytes_terminate(std::string src, char term, bool include) { + std::size_t new_len = 0; + std::size_t max_len = src.length(); + + while (new_len < max_len && src[new_len] != term) + new_len++; + + if (include && new_len < max_len) + new_len++; + + return src.substr(0, new_len); +} + +// ======================================================================== +// Byte array processing +// ======================================================================== + +std::string kaitai::kstream::process_xor_one(std::string data, uint8_t key) { + size_t len = data.length(); + std::string result(len, ' '); + + for (size_t i = 0; i < len; i++) + result[i] = data[i] ^ key; + + return result; +} + +std::string kaitai::kstream::process_xor_many(std::string data, std::string key) { + size_t len = data.length(); + size_t kl = key.length(); + std::string result(len, ' '); + + size_t ki = 0; + for (size_t i = 0; i < len; i++) { + result[i] = data[i] ^ key[ki]; + ki++; + if (ki >= kl) + ki = 0; + } + + return result; +} + +std::string kaitai::kstream::process_rotate_left(std::string data, int amount) { + size_t len = data.length(); + std::string result(len, ' '); + + for (size_t i = 0; i < len; i++) { + uint8_t bits = data[i]; + result[i] = (bits << amount) | (bits >> (8 - amount)); + } + + return result; +} + +#ifdef KS_ZLIB +#include + +std::string kaitai::kstream::process_zlib(std::string data) { + int ret; + + unsigned char *src_ptr = reinterpret_cast(&data[0]); + std::stringstream dst_strm; + + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + ret = inflateInit(&strm); + if (ret != Z_OK) + throw std::runtime_error("process_zlib: inflateInit error"); + + strm.next_in = src_ptr; + strm.avail_in = data.length(); + + unsigned char outbuffer[ZLIB_BUF_SIZE]; + std::string outstring; + + // get the decompressed bytes blockwise using repeated calls to inflate + do { + strm.next_out = reinterpret_cast(outbuffer); + strm.avail_out = sizeof(outbuffer); + + ret = inflate(&strm, 0); + + if (outstring.size() < strm.total_out) + outstring.append(reinterpret_cast(outbuffer), strm.total_out - outstring.size()); + } while (ret == Z_OK); + + if (ret != Z_STREAM_END) { // an error occurred that was not EOF + std::ostringstream exc_msg; + exc_msg << "process_zlib: error #" << ret << "): " << strm.msg; + throw std::runtime_error(exc_msg.str()); + } + + if (inflateEnd(&strm) != Z_OK) + throw std::runtime_error("process_zlib: inflateEnd error"); + + return outstring; +} +#endif + +// ======================================================================== +// Misc utility methods +// ======================================================================== + +int kaitai::kstream::mod(int a, int b) { + if (b <= 0) + throw std::invalid_argument("mod: divisor b <= 0"); + int r = a % b; + if (r < 0) + r += b; + return r; +} + +#include +void kaitai::kstream::unsigned_to_decimal(uint64_t number, char *buffer) { + // Implementation from https://ideone.com/nrQfA8 by Alf P. Steinbach + // (see https://www.zverovich.net/2013/09/07/integer-to-string-conversion-in-cplusplus.html#comment-1033931478) + if (number == 0) { + *buffer++ = '0'; + } else { + char *p_first = buffer; + while (number != 0) { + *buffer++ = static_cast('0' + number % 10); + number /= 10; + } + std::reverse(p_first, buffer); + } + *buffer = '\0'; +} + +int64_t kaitai::kstream::string_to_int(const std::string& str, int base) { + char *str_end; + + errno = 0; + int64_t res = strtoll(str.c_str(), &str_end, base); + + // Check for successful conversion and throw an exception if the entire string was not converted + if (str_end != str.c_str() + str.size()) { + throw std::invalid_argument("string_to_int"); + } + + if (errno == ERANGE) { + throw std::out_of_range("string_to_int"); + } + + return res; +} + +std::string kaitai::kstream::reverse(std::string val) { + std::reverse(val.begin(), val.end()); + + return val; +} + +uint8_t kaitai::kstream::byte_array_min(const std::string val) { + uint8_t min = 0xff; // UINT8_MAX + std::string::const_iterator end = val.end(); + for (std::string::const_iterator it = val.begin(); it != end; ++it) { + uint8_t cur = static_cast(*it); + if (cur < min) { + min = cur; + } + } + return min; +} + +uint8_t kaitai::kstream::byte_array_max(const std::string val) { + uint8_t max = 0; // UINT8_MIN + std::string::const_iterator end = val.end(); + for (std::string::const_iterator it = val.begin(); it != end; ++it) { + uint8_t cur = static_cast(*it); + if (cur > max) { + max = cur; + } + } + return max; +} + +// ======================================================================== +// Other internal methods +// ======================================================================== + +#ifndef KS_STR_DEFAULT_ENCODING +#define KS_STR_DEFAULT_ENCODING "UTF-8" +#endif + +#ifdef KS_STR_ENCODING_ICONV + +#include +#include +#include + +std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { + iconv_t cd = iconv_open(KS_STR_DEFAULT_ENCODING, src_enc); + + if (cd == (iconv_t)-1) { + if (errno == EINVAL) { + throw unknown_encoding(src_enc); + } else { + throw bytes_to_str_error("error opening iconv"); + } + } + + size_t src_len = src.length(); + size_t src_left = src_len; + + // Start with a buffer length of double the source length. + size_t dst_len = src_len * 2; + std::string dst(dst_len, ' '); + size_t dst_left = dst_len; + + // NB: this should be const char *, but for some reason iconv() requires non-const in its 2nd argument, + // so we force it with a cast. + char *src_ptr = const_cast(src.data()); + char *dst_ptr = &dst[0]; + + while (true) { + size_t res = iconv(cd, &src_ptr, &src_left, &dst_ptr, &dst_left); + + if (res == (size_t)-1) { + if (errno == E2BIG) { + // dst buffer is not enough to accomodate whole string + // enlarge the buffer and try again + size_t dst_used = dst_len - dst_left; + dst_left += dst_len; + dst_len += dst_len; + dst.resize(dst_len); + + // dst.resize might have allocated destination buffer in another area + // of memory, thus our previous pointer "dst" will be invalid; re-point + // it using "dst_used". + dst_ptr = &dst[dst_used]; + } else if (errno == EILSEQ) { + throw illegal_seq_in_encoding("EILSEQ"); + } else if (errno == EINVAL) { + throw illegal_seq_in_encoding("EINVAL"); + } else { + throw bytes_to_str_error(to_string(errno)); + } + } else { + // conversion successful + dst.resize(dst_len - dst_left); + break; + } + } + + if (iconv_close(cd) != 0) { + throw bytes_to_str_error("iconv close error"); + } + + return dst; +} +#elif defined(KS_STR_ENCODING_NONE) +std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { + return src; +} +#elif defined(KS_STR_ENCODING_WIN32API) +#include +#include + +// Unbreak std::numeric_limits::max, as otherwise MSVC substitutes "useful" max() macro. +#undef max + +int kaitai::kstream::encoding_to_win_codepage(const char *src_enc) { + std::string enc(src_enc); + if (enc == "UTF-8") { + return CP_UTF8; + } else if (enc == "UTF-16LE") { + return KAITAI_CP_UTF16LE; + } else if (enc == "UTF-16BE") { + return KAITAI_CP_UTF16BE; + } else if (enc == "IBM437") { + return 437; + } else if (enc == "IBM850") { + return 850; + } else if (enc == "SHIFT_JIS") { + return 932; + } else if (enc == "GB2312") { + return 936; + } else if (enc == "ASCII") { + return 20127; + } else if (enc == "EUC-JP") { + return 20932; + } else if (enc == "ISO-8859-1") { + return 28591; + } else if (enc == "ISO-8859-2") { + return 28592; + } else if (enc == "ISO-8859-3") { + return 28593; + } else if (enc == "ISO-8859-4") { + return 28594; + } else if (enc == "ISO-8859-5") { + return 28595; + } else if (enc == "ISO-8859-6") { + return 28596; + } else if (enc == "ISO-8859-7") { + return 28597; + } else if (enc == "ISO-8859-8") { + return 28598; + } else if (enc == "ISO-8859-9") { + return 28599; + } else if (enc == "ISO-8859-10") { + return 28600; + } else if (enc == "ISO-8859-11") { + return 28601; + } else if (enc == "ISO-8859-13") { + return 28603; + } else if (enc == "ISO-8859-14") { + return 28604; + } else if (enc == "ISO-8859-15") { + return 28605; + } else if (enc == "ISO-8859-16") { + return 28606; + } + + return KAITAI_CP_UNSUPPORTED; +} + +std::string kaitai::kstream::bytes_to_str(const std::string src, const char *src_enc) { + // Step 1: convert encoding name to codepage number + int codepage = encoding_to_win_codepage(src_enc); + if (codepage == KAITAI_CP_UNSUPPORTED) { + throw unknown_encoding(src_enc); + } + return bytes_to_str(src, codepage); +} + +std::string kaitai::kstream::bytes_to_str(const std::string src, int codepage) { + // Shortcut: if we're already in UTF-8, no need to convert anything + if (codepage == CP_UTF8) { + return src; + } + + // Step 2: convert bytes to UTF-16 ("wide char") string + std::wstring utf16; + int32_t utf16_len; + int32_t src_len; + if (src.length() > std::numeric_limits::max()) { + throw bytes_to_str_error("buffers longer than int32_t are unsupported"); + } else { + src_len = static_cast(src.length()); + } + + switch (codepage) { + case KAITAI_CP_UTF16LE: + // If our source is already UTF-16LE, just copy it + + if (src_len % 2 != 0) { + throw illegal_seq_in_encoding("incomplete"); + } + + utf16_len = src_len / 2; + utf16 = std::wstring((wchar_t*)src.c_str(), utf16_len); + break; + case KAITAI_CP_UTF16BE: + // If our source is in UTF-16BE, convert it to UTF-16LE by swapping bytes + + if (src_len % 2 != 0) { + throw illegal_seq_in_encoding("incomplete"); + } + + utf16_len = src_len / 2; + + utf16 = std::wstring(utf16_len, L'\0'); + for (int32_t i = 0; i < utf16_len; i++) { + utf16[i] = (static_cast(src[i * 2]) << 8) | static_cast(src[i * 2 + 1]); + } + break; + default: + // Calculate the length of the UTF-16 string + utf16_len = MultiByteToWideChar(codepage, 0, src.c_str(), src_len, 0, 0); + if (utf16_len == 0) { + throw bytes_to_str_error("MultiByteToWideChar length calculation error"); + } + + // Convert to UTF-16 string + utf16 = std::wstring(utf16_len, L'\0'); + if (MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, src.c_str(), src_len, &utf16[0], utf16_len) == 0) { + auto err = GetLastError(); + if (err == ERROR_NO_UNICODE_TRANSLATION) { + throw illegal_seq_in_encoding("MultiByteToWideChar"); + } else { + throw bytes_to_str_error("MultiByteToWideChar conversion error"); + } + } + } + + // Step 3: convert UTF-16 string to UTF-8 string + + // Calculate the length of the UTF-8 string + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, &utf16[0], utf16_len, 0, 0, 0, 0); + if (utf8_len == 0) { + throw bytes_to_str_error("WideCharToMultiByte length calculation error"); + } + + // Convert to UTF-8 string + std::string utf8(utf8_len, '\0'); + if (WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, &utf16[0], utf16_len, &utf8[0], utf8_len, 0, 0) == 0) { + auto err = GetLastError(); + if (err == ERROR_NO_UNICODE_TRANSLATION) { + throw illegal_seq_in_encoding("WideCharToMultiByte"); + } else { + throw bytes_to_str_error("WideCharToMultiByte conversion error"); + } + } + + return utf8; +} + +#else +#error Need to decide how to handle strings: please define one of: KS_STR_ENCODING_ICONV, KS_STR_ENCODING_WIN32API, KS_STR_ENCODING_NONE +#endif diff --git a/third_party/kaitai/kaitaistream.h b/third_party/kaitai/kaitaistream.h new file mode 100644 index 0000000..3523ed1 --- /dev/null +++ b/third_party/kaitai/kaitaistream.h @@ -0,0 +1,374 @@ +#ifndef KAITAI_STREAM_H +#define KAITAI_STREAM_H + +// Kaitai Struct runtime API version: x.y.z = 'xxxyyyzzz' decimal +#define KAITAI_STRUCT_VERSION 10000L + +#include +#include +#include +#include +#include +#include +#include + +namespace kaitai { + +/** + * Kaitai Stream class (kaitai::kstream) is an implementation of + * Kaitai Struct stream API + * for C++/STL. It's implemented as a wrapper over generic STL std::istream. + * + * It provides a wide variety of simple methods to read (parse) binary + * representations of primitive types, such as integer and floating + * point numbers, byte arrays and strings, and also provides stream + * positioning / navigation methods with unified cross-language and + * cross-toolkit semantics. + * + * Typically, end users won't access Kaitai Stream class manually, but would + * describe a binary structure format using .ksy language and then would use + * Kaitai Struct compiler to generate source code in desired target language. + * That code, in turn, would use this class and API to do the actual parsing + * job. + */ +class kstream { +public: + /** + * Constructs new Kaitai Stream object, wrapping a given std::istream. + * \param io istream object to use for this Kaitai Stream + */ + kstream(std::istream* io); + + /** + * Constructs new Kaitai Stream object, wrapping a given in-memory data + * buffer. + * \param data data buffer to use for this Kaitai Stream + */ + kstream(const std::string& data); + + void close(); + + /** @name Stream positioning */ + //@{ + /** + * Check if stream pointer is at the end of stream. Note that the semantics + * are different from traditional STL semantics: one does *not* need to do a + * read (which will fail) after the actual end of the stream to trigger EOF + * flag, which can be accessed after that read. It is sufficient to just be + * at the end of the stream for this method to return true. + * \return "true" if we are located at the end of the stream. + */ + bool is_eof() const; + + /** + * Set stream pointer to designated position. + * \param pos new position (offset in bytes from the beginning of the stream) + */ + void seek(uint64_t pos); + + /** + * Get current position of a stream pointer. + * \return pointer position, number of bytes from the beginning of the stream + */ + uint64_t pos(); + + /** + * Get total size of the stream in bytes. + * \return size of the stream in bytes + */ + uint64_t size(); + //@} + + /** @name Integer numbers */ + //@{ + + // ------------------------------------------------------------------------ + // Signed + // ------------------------------------------------------------------------ + + int8_t read_s1(); + + // ........................................................................ + // Big-endian + // ........................................................................ + + int16_t read_s2be(); + int32_t read_s4be(); + int64_t read_s8be(); + + // ........................................................................ + // Little-endian + // ........................................................................ + + int16_t read_s2le(); + int32_t read_s4le(); + int64_t read_s8le(); + + // ------------------------------------------------------------------------ + // Unsigned + // ------------------------------------------------------------------------ + + uint8_t read_u1(); + + // ........................................................................ + // Big-endian + // ........................................................................ + + uint16_t read_u2be(); + uint32_t read_u4be(); + uint64_t read_u8be(); + + // ........................................................................ + // Little-endian + // ........................................................................ + + uint16_t read_u2le(); + uint32_t read_u4le(); + uint64_t read_u8le(); + + //@} + + /** @name Floating point numbers */ + //@{ + + // ........................................................................ + // Big-endian + // ........................................................................ + + float read_f4be(); + double read_f8be(); + + // ........................................................................ + // Little-endian + // ........................................................................ + + float read_f4le(); + double read_f8le(); + + //@} + + /** @name Unaligned bit values */ + //@{ + + void align_to_byte(); + uint64_t read_bits_int_be(int n); + uint64_t read_bits_int(int n); + uint64_t read_bits_int_le(int n); + + //@} + + /** @name Byte arrays */ + //@{ + + std::string read_bytes(std::streamsize len); + std::string read_bytes_full(); + std::string read_bytes_term(char term, bool include, bool consume, bool eos_error); + std::string ensure_fixed_contents(std::string expected); + + static std::string bytes_strip_right(std::string src, char pad_byte); + static std::string bytes_terminate(std::string src, char term, bool include); + static std::string bytes_to_str(const std::string src, const char *src_enc); + + //@} + + /** @name Byte array processing */ + //@{ + + /** + * Performs a XOR processing with given data, XORing every byte of input with a single + * given value. + * @param data data to process + * @param key value to XOR with + * @return processed data + */ + static std::string process_xor_one(std::string data, uint8_t key); + + /** + * Performs a XOR processing with given data, XORing every byte of input with a key + * array, repeating key array many times, if necessary (i.e. if data array is longer + * than key array). + * @param data data to process + * @param key array of bytes to XOR with + * @return processed data + */ + static std::string process_xor_many(std::string data, std::string key); + + /** + * Performs a circular left rotation shift for a given buffer by a given amount of bits, + * using groups of 1 bytes each time. Right circular rotation should be performed + * using this procedure with corrected amount. + * @param data source data to process + * @param amount number of bits to shift by + * @return copy of source array with requested shift applied + */ + static std::string process_rotate_left(std::string data, int amount); + +#ifdef KS_ZLIB + /** + * Performs an unpacking ("inflation") of zlib-compressed data with usual zlib headers. + * @param data data to unpack + * @return unpacked data + * @throws IOException + */ + static std::string process_zlib(std::string data); +#endif + + //@} + + /** + * Performs modulo operation between two integers: dividend `a` + * and divisor `b`. Divisor `b` is expected to be positive. The + * result is always 0 <= x <= b - 1. + */ + static int mod(int a, int b); + + /** + * Converts given integer `val` to a decimal string representation. + * Should be used in place of std::to_string() (which is available only + * since C++11) in older C++ implementations. + */ + template +// check for C++11 support - https://stackoverflow.com/a/40512515 +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) + // https://stackoverflow.com/a/27913885 + typename std::enable_if< + std::is_integral::value && + // check if we don't have something too large like GCC's `__int128_t` + std::numeric_limits::max() >= 0 && + std::numeric_limits::max() <= std::numeric_limits::max(), + std::string + >::type +#else + std::string +#endif + static to_string(I val) { + // in theory, `digits10 + 3` would be enough (minus sign + leading digit + // + null terminator), but let's add a little more to be safe + char buf[std::numeric_limits::digits10 + 5]; + if (val < 0) { + buf[0] = '-'; + + // NB: `val` is negative and we need to get its absolute value (i.e. minus `val`). However, since + // `int64_t` uses two's complement representation, its range is `[-2**63, 2**63 - 1] = + // [-0x8000_0000_0000_0000, 0x7fff_ffff_ffff_ffff]` (both ends inclusive) and thus the naive + // `-val` operation will overflow for `val = std::numeric_limits::min() = + // -0x8000_0000_0000_0000` (because the result of `-val` is mathematically + // `-(-0x8000_0000_0000_0000) = 0x8000_0000_0000_0000`, but the `int64_t` type can represent at + // most `0x7fff_ffff_ffff_ffff`). And signed integer overflow is undefined behavior in C++. + // + // To avoid undefined behavior for `val = -0x8000_0000_0000_0000 = -2**63`, we do the following + // steps for all negative `val`s: + // + // 1. Convert the signed (and negative) `val` to an unsigned `uint64_t` type. This is a + // well-defined operation in C++: the resulting `uint64_t` value will be `val mod 2**64` (`mod` + // is modulo). The maximum `val` we can have here is `-1` (because `val < 0`), a theoretical + // minimum we are able to support would be `-2**64 + 1 = -0xffff_ffff_ffff_ffff` (even though + // in practice the widest standard type is `int64_t` with the minimum of `-2**63`): + // + // * `static_cast(-1) = -1 mod 2**64 = 2**64 + (-1) = 0xffff_ffff_ffff_ffff = 2**64 - 1` + // * `static_cast(-2**64 + 1) = (-2**64 + 1) mod 2**64 = 2**64 + (-2**64 + 1) = 1` + // + // 2. Subtract `static_cast(val)` from `2**64 - 1 = 0xffff_ffff_ffff_ffff`. Since + // `static_cast(val)` is in range `[1, 2**64 - 1]` (see step 1), the result of this + // subtraction will be mathematically in range `[0, (2**64 - 1) - 1] = [0, 2**64 - 2]`. So the + // mathematical result cannot be negative, hence this unsigned integer subtraction can never + // wrap around (which wouldn't be a good thing to rely upon because it confuses programmers and + // code analysis tools). + // + // 3. Since we did mathematically `(2**64 - 1) - (2**64 + val) = -val - 1` so far (and we wanted + // to do `-val`), we add `1` to correct that. From step 2 we know that the result of `-val - 1` + // is in range `[0, 2**64 - 2]`, so adding `1` will not wrap (at most we could get `2**64 - 1 = + // 0xffff_ffff_ffff_ffff`, which is still in the valid range of `uint64_t`). + + unsigned_to_decimal((std::numeric_limits::max() - static_cast(val)) + 1, &buf[1]); + } else { + unsigned_to_decimal(val, buf); + } + return std::string(buf); + } + + /** + * Converts string `str` to an integer value. Throws an exception if the + * string is not a valid integer. + * + * This one is supposed to mirror `std::stoll()` (which is available only + * since C++11) in older C++ implementations. + * + * Major difference between standard `std::stoll()` and `string_to_int()` + * is that this one does not perform any partial conversions and always + * throws `std::invalid_argument` if the string is not a valid integer. + * + * @param str String to convert + * @param base Base of the integer (default: 10) + * @throws std::invalid_argument if the string is not a valid integer + * @throws std::out_of_range if the integer is out of range + * @return Integer value of the string + */ + static int64_t string_to_int(const std::string& str, int base = 10); + + /** + * Reverses given string `val`, so that the first character becomes the + * last and the last one becomes the first. This should be used to avoid + * the need of local variables at the caller. + */ + static std::string reverse(std::string val); + + /** + * Finds the minimal byte in a byte array, treating bytes as + * unsigned values. + * @param val byte array to scan + * @return minimal byte in byte array as integer + */ + static uint8_t byte_array_min(const std::string val); + + /** + * Finds the maximal byte in a byte array, treating bytes as + * unsigned values. + * @param val byte array to scan + * @return maximal byte in byte array as integer + */ + static uint8_t byte_array_max(const std::string val); + +private: + std::istream* m_io; + std::istringstream m_io_str; + int m_bits_left; + uint64_t m_bits; + + void init(); + void exceptions_enable() const; + + static void unsigned_to_decimal(uint64_t number, char *buffer); + +#ifdef KS_STR_ENCODING_WIN32API + enum { + KAITAI_CP_UNSUPPORTED = -1, + KAITAI_CP_UTF16LE = -2, + KAITAI_CP_UTF16BE = -3, + }; + + /** + * Converts string name of the encoding into a Windows codepage number. We extend standard Windows codepage list + * with a few special meanings (see KAITAI_CP_* enum), reserving negative values of integer for that. + * @param src_enc string name of the encoding; this should match canonical name of the encoding as per discussion + * in https://github.com/kaitai-io/kaitai_struct/issues/116 + * @return Windows codepage number or member of KAITAI_CP_* enum. + * @ref https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers + */ + static int encoding_to_win_codepage(const char *src_enc); + + /** + * Converts bytes packed in std::string into a UTF-8 string, based on given source encoding indicated by `codepage`. + * @param src bytes to be converted + * @param codepage Windows codepage number or member of KAITAI_CP_* enum. + * @return UTF-8 string + */ + static std::string bytes_to_str(const std::string src, int codepage); +#endif + + static const int ZLIB_BUF_SIZE = 128 * 1024; +}; + +} + +#endif diff --git a/third_party/kaitai/kaitaistruct.h b/third_party/kaitai/kaitaistruct.h new file mode 100644 index 0000000..8172ede --- /dev/null +++ b/third_party/kaitai/kaitaistruct.h @@ -0,0 +1,20 @@ +#ifndef KAITAI_STRUCT_H +#define KAITAI_STRUCT_H + +#include + +namespace kaitai { + +class kstruct { +public: + kstruct(kstream *_io) { m__io = _io; } + virtual ~kstruct() {} +protected: + kstream *m__io; +public: + kstream *_io() { return m__io; } +}; + +} + +#endif