From fc7d3b73eb1fc6ddbc769dfb4021304772750737 Mon Sep 17 00:00:00 2001 From: Thomas VINCENT Date: Fri, 21 Oct 2022 10:42:04 +0200 Subject: [PATCH] Squashed 'src/zfp/' changes from e8edaced..f39af726 f39af726 Release 1.0.0 82a557c8 Set release version and date 7f84893b Clarify ZFP_VERSION_DEVELOP meaning and value 074f0109 Point badges and documentation to release1.0.0 branch 1fc3539e Align license.rst with LICENSE ed01090c Add version_tweak to zFORp 9ac63801 Document change in ZFP_VERSION mapping fff64a14 Revert to gencodec for consistency with zfpcodec 07485e1a Update ReadTheDocs badge link e1af8487 Bump copyright date, add copyright notice to zfp.hpp 2445a7c3 Bump zFORp_library_version 3ed74345 Ensure make clean removes libcfp 6c66c665 Proofread and ensure documentation is up to date 9cc3a25b Update expected testzfp errors for large arrays 8efecf10 Fix various bugs in diffusionC 49db1371 Fix incorrect order of distance() arguments afabe40b Add cfp references, pointer, iterators, serialization to CHANGELOG 302d0a73 Fix C89 incompatibility in iteratorC.c a9203c64 Add missing const_array::params() and const_array::set_params() c6974ace Add zfp_field_blocks() to zFORp 46deb610 Update execution doc with planned future capabilities e4357d0e Cosmetic changes to compression modes doc f88a4131 Add variable-length code example 3deba5f1 Add Apple clang OpenMP instructions bc29521b Update authors, zfp homepage URL 43d29aff Update links to zfp applications c9ec980f Update zfp homepage URL 3c544564 Update authors and contributors 4ce851be Update versions.rst to align with CHANGELOG.md 44d4f849 Remove unused variable in decoder test 9a9a2263 update doc versions and add some missing changes 962a48bd update version in license and tests c86be611 Update zfpy version number c99a41f4 Fix mismatched memory (de)allocation calls 6825ee26 Add missing space between literal and identifier 91f1823f Update version identifiers for next release cb618438 Correct suffix for examples/array2d.h 539db086 Update CHANGELOG to reflect directory restructuring 1c66f76b Silence compiler warnings in tests b2ea0416 Remove -I../array 1bcbf0ec Ensure proper make clean in examples 8eecbfbb Document new name for zFORp module ec96f631 Document lower bound on maxbits in expert mode c597619c Fix compiler warnings in tests b1ce0c16 Merge branch 'develop' into refactor/install-directory 73aeb464 Fix compiler warnings/errors in examples 7edcd917 Update time limit on gitlab tests 725f5cd7 remove fortran modules directory from git tracking 982ca868 Update cfp header includes, rename main cfp header 0f530708 Silence copy constructor warning 54966d1e Fix gmake build issues 7c9b4d23 Merge branch 'develop' into refactor/install-directory 81dbdd02 Fix Fortran GNU builds and compiler requirements 97fa1d75 Disable thread safety checks when OpenMP is absent 6c46dccf Update makefiles and fix a few inconsistencies f0899413 Clarify that low-level API ignores execution policy 736581dd Silence implicit conversion warnings 86ffc4f2 Rename ZFP_VERSION_DEVELOP, remove ZFP_VERSION_RELEASE 0135c037 Merge branch 'bugfix/thread-safety' into develop (resolves #154) 20a2efdc Use zfp_field_blocks() in codec 5c424bbf Update CHANGELOG [skip ci] 11f99284 Enabled OpenMP with testviews e89d8038 Handle empty field in zfp_field_blocks() 0447f7e5 Make codec thread-safe b9166ad1 Add LDFLAGS to support -lomp on Mac 5190cae2 Add zfp_field_blocks() e1ac5e4e Merge pull request #165 from DimitriPapadopoulos/codespell 20e62c58 Fix typos found by codespell 05b3f4c5 Update fortran 
module name and move internal codec headers c13a6c7b Update bitstream.c include in cuda [skip ci] 9e78178a Fix include in zfpy 3674669d Update zforp install location 5fd6de39 update zfp util f56070f6 updated tests fixed additional missing includes 615e86f4 Update headers in examples and fix bad include eeac3a28 Updated cfp includes, fixed build issues c7326188 Update cmake b2366c0e updated inline src and moved cfp src 4aafbc81 Update hpp include guards afe6a01f Update includes for internal headers 013210b0 Update includes for all non-internal headers e1e4840c Move include files 9b0d278f Update where gitlab cpu tests are run 26678cde Replace stdio with iostream in testviews bb59d845 Fix minor documentation issues 537c1205 Add compressed-array example cc2e8500 add policy mode validation checks 938ffd2e change exec policy params to use void* 7335959b Merge branch 'refactor/stream-offsets' into develop 24bcfa28 Merge branch 'develop' into refactor/stream-offsets 2d975ce8 Correct type mismatches to silence compiler warnings bf718a60 Update bitstream types in CUDA tests 9a1e3409 Introduce bitstream_{offset,size,count} types for 64-bit support b40ab53d temporarily disable actions side of cmake test 9dcf734c temporarily disable actions side of cmake test 72fcac57 Add cmake version checking to tests 980155a8 Merge pull request #162 from jonashaag/patch-1 cf7bd9ea Fix exception handling in compress_numpy 5d3ac203 Revert accidental change to ZFP_VERSION_RELEASE define 0335096e Fix version.h compile error dd915b95 Update version files to support intermediate version numbers 15d77178 Move memory management to internal namespace 0f06782e Rename word -> stream_word 1d6fbe59 Silence compiler warnings in tests f49a7fe5 Silence compiler warnings in tests baaa782a Remove debugging code from diffusion example 76d684de Update CMakeLists to support OpenMP with AppleClang 19b261d6 Add macros for constructing version number and string ad926b6a Add back references to execution section in docs f6880a6a Add #150 to CHANGELOG d36f54d9 Fixes #150 6bfd003a Revert attempted fix for #150 e620e546 Undo failed attempt to fix #150 a7c6047f Attempted fix for issue #150 (cmocka macOS build) da4e2829 Allow setting BIT_STREAM_WORD_TYPE on make line 33065d23 Add support for generic codec in diffusion example f4e0850c Clarify that rounding mode, daz supported only on CPU 2b03babb Add conda zfp package and zarr application to docs 4a33aab5 add missing zfp field and stream tests 38e3cbbe Silence const compiler warnings in ppm aeb707c1 Add CUDA tests for unsupported 4D data 5fe5ed19 Change name of actions workflow ba2df4bc move gitlab cpu tests to new system a674bf58 Remove Travis CI files fd573184 Move status badge from travis to actions 64e5db65 Update python cmake build scripts 7ad84769 Add python build hints to tests 50774dcf Enable cython action temporarily disable win tests d42fb0ee enable actions pytests and split py adv decompress test into bytes/memview versions afac3f94 made test output more verbose 31902956 remove unneeded test code 84fb3d88 Make array::set(0) zero-initialize array 323e3e70 Move Cache and Store to internal namespace eb6cd287 Bug fix: reopen codec when compacting store 122918c4 Fix incorrect loop counter types in simple.c 662fc0f4 Add size_bytes() to cfp 05156f50 Replace ieeecodec with generic codec templated on scalar type 8441d706 Fix inconsistent coding style in zfpcodec 142a11f7 Setup github actions config 5555333a add promote/demote util tests b91dcafb Added field_metadata failure tests 6aec9dc4 
compilation error fix de381394 Fix unused variable warning d65af032 Add link to MVAPICH2-GDR 5c991daa Add/rename top-level files for E4S compliance 99e526cd Document CUDA limitations 22beacb0 Document ZFP_WITH_CUDA in config file 57c50e03 Add FAQ on error distributions bbfb88db Clarify FAQ #17 on violated tolerances 1cd5978f Add printf FAQ 95dc3582 Merge pull request #148 from vicentebolea/cmake-fix-include-dir b6f344ff CMAKE: ZFP_INCLUDE_DIRS avail at configure time 310e12d5 Merge pull request #147 from vicentebolea/add-zfp-cuda-var 67036823 Update zfp-config.cmake.in f5d8c282 Add templated encode/decode tests f65ac443 Move cuda testing to new machine 5079b895 Extend and include testviews in tests 2d0ba85f Fix view const accessors a922f640 Clean up .travis.yml 1a4565c6 Force gcc-4.9 on Travis CI 416e1957 Require CMake 3.9 in tests af933bb9 Suppress CMake warning about unused ZFP_OMP_TESTS_ONLY f718c561 Add nullptr_t proxy pointer constructors d83d3432 Update README build report badge bfbaa992 Undo attempted codecov fix 1e74a04c Add mask parameter to header.size_bytes() 62569178 Bug fix: overflow due to subnormals (resolves #119) 3f768fad Fix zfp_field bugs that confused precision and byte size ecbbf763 Fix missing codecov uploads ef8b8791 Document C++ wrappers for low-level API 8e790cbf Update installation instructions 88164401 Complete citation record 0ff612ec Merge pull request #141 from researchapps/add/github-citation cd14c063 GitHub has recently added this standard file, CITATION.cff, that will render a sidebar link to easily copy paste bibtex information to cite the work. This should be useful for zfp 33eed2fb Fix missing size_t and declarations in tests c3a7b334 Fix missing size_t types and casts 19107376 Fix C89 incompatibility in tests a85609f0 Build cmocka using prescribed build type for MSVC 149de0a5 Update cfp tests to use size_t 416d16b8 Complete size_t transition for array tests 83d572f1 Fix broken C89 support in tests 45c9e82b Silence warnings and use size_t over uint in tests 7450b728 Silence miscellaneous compiler warnings ce5c5b3a Silence compiler warnings 4e49b318 Fix gtest deprecation warnings 6443612c Merge branch 'develop' of github.com:LLNL/zfp into develop 02872e2a added templated encode/decode block tests 72278260 Merge branch 'refactor/64-bit-indexing' into develop 954390e1 Silence compiler warnings in tests 2a2ff357 Update docs with new zfp_field and function signatures 28a3858b Silence compiler warnings in tests d97e589b Update examples and utils to use size_t array dimensions b756eb04 Handle missing size_t/ptrdiff_t c3672137 Update Fortran API to support 64-bit indexing 64f71dde Update Python API to support 64-bit indexing 5f3ad37d 64-bit indexing: replace uint/int with size_t/ptrdiff_t 24bcaf80 Organized tests/array directory ea8124f8 merged cpp namespace into zfp namespace 01ccb95d Clarify constraints on rate in docs b47749f5 changed gtest version to fix compilation issues with non-gnu linux compilers 496a416c fixed macro expansion issue (commit issue fix) 1c5bc2ab fixed macro expansion issue c9adc2a4 fixed macro expansion issue 1f5fb92a Rename Travis files 62f35034 Merge branch 'feature/const-arrays' into develop 5bdd22f3 Merge branch 'develop' into feature/const-arrays 26c5eb9d Do not reset rate in zfp::index::implicit::clear() a571d435 Add appropriate casts around CHAR_BIT * sizeof() constructs 5c7f6ecc Add dispatch function for integer encoding/decoding 6cf405a2 added in const array bitstream checksum test 5f6394ee disable travis tests temporarily 
6a7364b3 Fix docs on array serialization constraints cea4ff6c Merge branch 'feature/unbiased-error' into develop 8a4487e7 Document ZFP_ROUND_* macros bd2b1545 Fix CUDA rounding issue due to incorrect NBMASK 3205d721 Add rounding modes to CUDA implementation 4f3f1d4a Disabled travis linux testing due to new usage restrictions caf14c7f Added index names to test reporting a949b1c5 Added block index tests for const arrays 2968e8e0 Specialize coding for unconstrained rate 74a11efe Merge pull request #137 from jwsblokland/bugfix/cmake 2fba7f69 CMake: (fix) OpenMP dependency. f25982ad Merge branch 'misc/travis-update' into develop eb1d4c2d Discontinue xcode7.3 tests e5a55f76 Fix build errors 4f409922 Added const array checksum tests 97044629 Only call cudaGetDeviceProperties once. fcae077d Add ability to specify CUDA compilation flags. 8a4db7fb Added in tests for multiple precision/accuracy compression mode values fb4d7a61 Fixed uninitialized value error in test fixture 09a9f761 Initial work on const array tests 2d92b09d moved version info to zfp/version.h header 6389b89d Implemented fixes discussed in issue #126 12772bd4 Fix incorrect math in relative-error bound FAQ a51171f1 Document promotion/demotion of scalars 3ddbd24d renamed header.size() to header.size_bytes() to better clarify its functionality 6b2da922 strip out redundant CI builds 11341f46 Add new rounding mode: ZFP_ROUND_FIRST 6d24ac9f Fixed various CI test issues 928bbceb refactored GitlabCI config to be more extensible f3736ca5 Update diffusion documentation 321ce7fc Add codec and index documentation 68099ac6 Add const_array support for expert mode 1d64a161 Fix off-by-one error in ZFP_MAX_BITS aabe13f9 Documentation: Use checkmarks in tables 8ded86bd Add FAQ on C vs. Fortran order ebe66d62 Add FAQ on compressed storage size d7686bc4 Bug fix: compressed block size in reversible mode (resolves #127) 35bc789a Rename zfp::codec derived classes d9117612 Rename conflicting zfp_stream* codec::zfp::zfp 6b6602b6 Add zfp::codec namespace 24f9b070 Add support for const_array in diffusion example d2932309 Fix incorrect rate granularity in FAQ c733c923 Document const_array rate granularity and zero initialization 1d38a7f0 Refactor and fix bugs in arrays, store, index, codec 6993e72d Add exception on cache overflow 93780bb1 Add size_bytes() to fixed-rate arrays 4abe3d5c Fix incorrect cache size initialization 71d24c01 Add missing size_bytes() for block stores 4142c999 Remove debug info bb4e567a Add const_array documentation 9f2b09c8 Add missing size_bytes() for block caches fee7e80e Consolidate shape() and block_shape() functions b2816e2e fix compiler error during cuda testing 0b4f9a3b Add 1D, 2D, 4D const_arrays a56a749f Merge branch 'develop' into feature/const-arrays dd174cd2 Update documentation on exceptions 0ecdc57c Add zfp_field layout functions 4a0d1f3e Use zfp_bool in examples 0ceec10c Ensure diffusionC builds with cfp ccdec24e Add zfp_bool documentation 53b69c4d Document ppm example ee3c9fe2 Refactor array/cache/store to support const_array 10612d47 Exclude redundant header inclusion aa71d4ad Improve POD memory allocation 35086732 Add inclusion of missing headers 1de54120 Add zfp_config for const_array constructor 695886a7 Move install command to utils subdirectory f4e55986 Change gtest build paths (resolves #108) ab07ab95 Remove zfp utility as install target 1fa8dbbe Add zfp utility as install target (resolves #122) a215e19c Merge branch 'bugfix/omp-large-arrays' into develop 67ab4c98 Avoid Win32 size_t overflow b09d079a Resolves #125 
1051621b Pick up .travis.yml fix 915e0414 Squashed commit of the following: f784bbeb Revert to OpenMP 2.0 to accommodate MSVC e525e484 Fix OpenMP 32-bit int overflow; require OpenMP 3.0 f668af3d Add support for querying rate, precision, accuracy d24b528e Fix bad link in file compression docs a9694007 Bug fix: CUDA bit streams limited to 2^31 bits 8d09ee13 Silence compiler warnings abc76964 Correct comments/code for coding partial/strided blocks 4fd72928 Rename cfparrays.*; add cfp/src/template directory 779b3665 Minor improvements to simple.c 5cefb373 Clarify zfp_read_header() documentation 7e68696d Consolidate intro and overview sections 1829523e added small fixes for zfp testing, new gitlab CI functionality 74364a8a Merge branch 'feature/cfp-references' into develop 71f13b81 Refactor cfp header API to align with accessors ccdc3ec6 Silence PGI compiler warnings 5feae4ca Support no alignment via ZFP_CACHE_LINE_SIZE=0 ebb172d0 Propagate zfp_private_defs to tests a048b3f5 Expose ZFP_CACHE_LINE_SIZE to address PGI compiler bug e11004ae Add cfp serialization documentation bf09645b Update Makefiles to simplify macOS builds 7eb57fa0 Clean up cfp docs 7ab1ed61 Update docs to use zfp_bool 9c6032bd Make cfp its own section 6f111ce2 Update cfp documentation 61f7b2ca Add zfp_bool documentation ac22c584 Clean up merge and :c:expr: Sphinx issues 6db9cf04 Merge branch 'feature/cfp-references' into develop 661cd2aa Add cfp multidim iterator tests dca15d47 Ensure consistent cfp parameter names and APIs 72e4fd1f Remove unnecessary casts b2e65852 Add iterators to diffusionC; ensure C89 compatibility 88906c39 Add cfp random-access iterators and const qualifiers; refactor header 03f213ba Rename and reorder accessor members f7682e65 Refactor cfpheader to avoid circular inclusion 87b770ea changed booleans from int to zfp_bool type 6b8e03dd Added cfp header tests b20b2bb4 added header functionality to cfp 75c2a59c Avoid importing unused cpython array for compatibility with pypy 50aa4f19 changed uint8_t and memoryview test 29d7de12 change to void, add tests similar to ensure_ndarray 5cb7a325 remove print cython version 89b19d3f remove apt-get cython3 a4d6edac change cython3 to cython 2739f2e7 upgrade pip first 2fc75483 change back to pip 1d9524a5 remove upgrade pip b560ddcb use pip3 instead pip 66d45d1c change cython3 in .travis.yml 0bed27e1 enable cython3 upgrade on xenial 3c2c0612 add cython version check in travis.sh 8269e09b check cython version in zfpy.pyx bf05fb43 change to void * f0f82406 add in stream_open 31d8c51c remove const for test a3c572a1 Update python/zfpy.pyx 14da5114 Update python/zfpy.pyx a75ef261 replace bytes of compressed stream to uint8_t 1f4973df fixed missing semi-colon ee58ae34 changed cfp 4d fixed rate tests to prevent future problems a36d4063 Fixed issues with 4d checksum tests 1c312335 Fix failing 4D cfp test due to lack of precision 29c4f5a9 Changed out of date uint references to size_t in docs 0541ec97 Added 4d cfp documentation 15187ed8 Disabled broken checksum tests 42d3c7fe Added cfp 4d tests a79c4930 Merge branch 'develop' into feature/const-arrays 0817b086 Add documentation for 4D arrays a5959fe0 Use python3 with sphinx documentation 6f829dd5 Enable 4D array support in cfp 78c2c51f Merge branch 'develop' into feature/cfp-references c6bcb098 Remove temporary fix to private_view::get() 2fbe0084 Merge branch 'refactor/arrays' into develop 8b794fb8 Fix ppm issues with clamping; add chroma dimensionality parameter 33149742 Add support for 1D chroma blocks in ppm 1fd3a5f8 Revise 
zfp_bool type for C++ and update function signatures 50b635b4 Add tests for 4D compressed arrays 635db5c5 Fix minor issues with 3D array tests 915ba215 Fix copy-paste errors uncovered by tests bdb2623c Fix ppm range expansion bug 5816bcde Add ppm example d2c844c6 Add pointer and iterator tests for views 6a544b05 Add missing const_pointer and const_iterator friends to views 8e1d8dcd Fix testzfp-large tolerances for new default cache size 23bb66f6 Remove obsolete 4D deserialization test 935c3893 changed ctor name 67383cc4 Correct order of arguments in cfp {ptr,iter}.distance bf58a4ad Prepare cfp for 4D arrays 02a403ee Add testzfp tests for 4D array classes fdd14c9e Add 4D compressed-array classes 044cbe14 Add offset dereference operator tests for pointer and iterators 35d5c8cb Fix copy-paste error in iterator3 9761ee91 Update docs for iterators and views cf899daf Fix bad typedefs in iterators 9bf99934 Refactor accessors using container template 29cbdf76 Changed cfp to use safer static casting 6bd6d724 Change to cfp header/array typedefs to solve issue with pre gcc 4.8 fa0d09c4 add cfp_header stubs to prep for new zfp header class 5b687458 Promote all iterators to random access; prepare for view iterators 19720bf7 Clean up documentation 520596bd Fix incorrect reads through private_view f3e1ca9a Fix zfp_stream_maximum_size() estimate in reversible mode ca97dfdc Extra cleanup & organization pass for cfp tests/docs d1d185c1 fix iteratorC formatting to match new output types 97749bd0 Add missing functionality for 2d/3d cfp_iter, move cfp fully from uint/int to size_t/ptrdiff_t 3bdb7fa3 zfpy 0.5.5 8f81ae47 Update FAQ on relative error 2dafa9f8 Update cfp pointer docs 8487b618 Add missing functionality for 2d/3d cfp_ptr d83c2c81 Fix cfp naming consistency, add missing 1d functionality 68bc279b Mention const pointer availability b5255c5a Update docs to reflect 64-bit support 0df472c3 Small tweak to cfp docs 2104f25c Add const_pointer tests 97818431 Add const iterator tests 0086df3a Add pointer comparison tests a29e83a9 Const accessor constructors must accept const containers 3a33078b zfpy 0.5.5-rc5 cf8740a1 zfp 0.5.5-rc4 94ac5c00 Add alignment, header tests 739ea17c zfpy 0.5.5rc3 61048ab2 zfpy rc2 37096f14 Update docs for new const accessors, serialization, etc. 
9d0d1df3 Fixed iteratorC to match output with iterator example 1167e3d5 Docs changes: fix ptr/iter names, add missing functions, fix broken links ba0d5f46 Add zfpy-wheels long description 64f1b58a Update version 8d0f76fd Windows friendly setup.py 33f9c665 Add URL 0f148ecf Build wheel with setup.py 35b5cafd Workaround for conflicting container_type declarations ad613305 Resolve conflicting container_type in private views 1c4b9d9b Replace array with container_type in views 2ecc4686 Make accessors return const_reference; add pointers to views cc332853 cfp accessor documentation first pass 27aca9c3 Fix iterator ~3u bug 20d81aa2 Add full 64-bit support to compressed arrays da4a71f2 Silence type conversion warnings f7893523 Rewrite zfp codec to use zfp::cpp templates 80b7841a Add C++ wrappers around low-level C API 470a743d Add typename to please MSVC 14 6a7ee093 Change header API to rely on constructor e30a70a7 Replace last occurrence of stream_word_bits 45c508d0 Plug memory leak in zfp::array::construct 9548b940 Plug memory leak in block store abc6922e Clean up codec and use stream_alignment() 90009fd0 Add stream_alignment() function 4e6f0315 Fix using declarations for MSVC a42b52d6 Work around googletest iterator bug f6a8be00 Update compressed_data() API in cfp bc55d5e7 Fix codec constructors 4d4001e8 Remove obsolete code 81e577dd Refactor array serialization 24270e88 Add Boolean type 229a37df Respect cache size request 6f0b78e4 Move codec specializations 6e7c12b5 Use std:: prefix in memory allocation where appropriate 878b329c Add type traits 3539cc84 Reinstate private views 4f2525c9 Fix incorrect cache size initialization a688d9de Initialize array dimensions in constructor from view efb78225 Move codec from BlockStore to BlockCache 086a156a Rename storage -> store d8b099c9 Fix codec source file names a7b5a6d5 Update tests to support new array API d2ca8ae6 Add new block storage + cache and refactor arrays + views ecd4fb49 Add specialized allocator for void* 3b12f7c0 Made zfp memory alignment a compile-time macro 91e702f7 Add missing cbegin/cend to 2D and 3D arrays b98c864b Update 3D references, pointers, iterators, and views a2a7a2cf Fix incorrect pointer2 comparisons e8e1000c Update 2D references, pointers, iterators, and views b05afe92 Fix inconsistent relational operators 47e4c7ce Replace 1D view_reference with new generic reference 90eff650 Traverse array using new const iterator aacc4989 Refactor 1D references, pointers, iterators and add const versions 079c409f Fix Cython 'Assignment to non-lvalue' errors 2d787371 Remove trailing semicolons which Cython treats as errors a1a876c9 Fix make install clobbering existing gtest installations fc0ecac3 Optimized cfp references and arrays based on results of testing c4029cf4 Added cfp iterator example 08adb27d Fix typo in CXXFLAGS 0202b0be Check for undefined __STDC_VERSION__ to suppress warnings bc2fe6bf Add reversible mode to simple.c 1bb7955a Minor documentation changes 5254963d Added iterator tests 18f6b8ad Added proxy iterators bbcded58 Fix AppVeyor badge URL 74e8a145 Fix bad URLs e1b45152 Reorganize README file a05e2e52 Update license info per LLNL guidelines bcea8930 Update Appveyor project URL 05ef3411 Added tests for pointers and array flat ptr/ref 26be0ff9 Added pointer shifting/arithmetic and flat array ref/ptr accessors c8969dfa changed references to linear indexing 4079af78 Implemented cfp pointers 5710f291 Removed unnecessary export_ macro. ee7ecc23 Revert "Minor optimization of lifting transform." 
92fca13d Minor optimization of lifting transform. 4cb27e2a Fixed linker errors. aa626692 Fixed LNK4217 linker warnings when using Visual Studio. Also removes .lib and .exp files created along executable file of a project using static zfp library. b515bd11 Fixed linker error LNK1114 when using Visual Studio. The fix required a change of name of zfp command line executable. 2ab06b3d hotfix: Fix wrong memset params order b1a4f8a0 set and copy added to cfp references, diffusion example updated to use new API d62a7f9e added get_ref to cfp arrays and get (value) to cfp refs (with associated tests) af30bdc2 Changed cfp_array initialization syntax to fix windows build issue 28decba3 cfp_arrayALL changed from raw pointer to struct wrapped pointer (prep for cfp refs) e748a9c9 Clean up code edf135f5 Add compressed block index bcf97058 Add variable rate support f80c5103 Add uncompressed block index 40ae50ce Return block size in codec e9f4743c Initial pass for 3D arrays 40ef12b7 Merge branch 'develop' of github.com:LLNL/zfp into develop 814c1108 Add reversible mode to simple.c 6fe05a5d Minor documentation changes fa79823c Merge branch 'misc/checksum-gen-tool' into develop 1c6144a9 updated cython for new checksum API b82d9e83 TODO: update cython for new test utils APIs 1a32165e Update cpp and CFP tests to conform to new checksum API e75a0b11 Extend zfpChecksumsLib API for original-input specific function (omits irrelevant parameters) 32b4637a Checksums now support tuples (key1, key2, value), where new key2 encodes array dimension lengths. Encode, decode, endtoend all updated 93b602ce Checksum (key, value) lookup working for encode, decode, and endtoend tests. zfpChecksumsLib updated to handle (key, value) lookup 9e8a000e Checksum generation working for encode, decode, endtoend tests, across {dimensionalities 1-4}x{all scalar types} 00b944b0 endtoend tests can print (key, value) = (descriptor, checksum) pairs when compiled with flag PRINT_CHECKSUMS 881d8418 Update links to zfp website 6da6eff2 updated cython for new checksum API d85b6fa4 Remove redundant CUDA h2d transfers 3119f94d add short mention of third-party tools in overview a8def9af adding a short note and link for h5z-zfp 068f8150 Merge pull request #61 from maddyscientist/feature/cuda_encode_unroll 110a4945 Merge pull request #67 from maddyscientist/feature/cuda_decode_unroll 4cfd5882 TODO: update cython for new test utils APIs 070e4e4b Update cpp and CFP tests to conform to new checksum API cf8c779d Extend zfpChecksumsLib API for original-input specific function (omits irrelevant parameters) bf577f52 Checksums now support tuples (key1, key2, value), where new key2 encodes array dimension lengths. Encode, decode, endtoend all updated 64ef0ea9 Checksum (key, value) lookup working for encode, decode, and endtoend tests. 
zfpChecksumsLib updated to handle (key, value) lookup 208c6bbf Checksum generation working for encode, decode, endtoend tests, across {dimensionalities 1-4}x{all scalar types} 680fbb94 endtoend tests can print (key, value) = (descriptor, checksum) pairs when compiled with flag PRINT_CHECKSUMS 034f37bd Fix missing CUDA test function from previous refactor commit 047da679 Merge branch 'refactor/combine-testcases' into develop 7b75e6ce Remove fatal fail_msg() calls from setupCompressParam() because it would terminate some looping tests early ec777124 Minor test changes: print uint64 in hex format, add "ERROR: " prefix to nonfatal print statements cb784f8f Refactor compress/decompress, and hash bitstream/array to separate functions for re-use in endtoend tests 7c751efa Combine endtoend testcases for CUDA, similarly to past 2 commits. Also perform some refactoring across endtoend testcode a608eb9b Combine testcases for OpenMP endtoend tests, such that one testcase now tests 1 compression mode, varying the compression parameter (3), threadcount (3), and chunk size (3). 32fa7e76 Combine testcases for serial endtoend tests, such that one test case tests a compression mode across all 3 compression parameters, covering all compress/decompress checksum comparisons, without prematurely ending at the first failure 045ca9cb Refactor encode/decode block special value tests (10 testcases become 1) a7e77cc1 Fix typos in CFP documentation ede3bee7 Merge branch 'develop' into feature/cuda_decode_unroll 989e1aa6 Merge pull request #1 from LLNL/develop b7dcdb01 Unroll optimization for CUDA encode scatter partial functions ae26c99c Merge pull request #65 from maddyscientist/feature/cuda_inline_constants 76112759 Let the compiler inline the permutation lookup table into registers rather than using __constant__ memory 4444fd30 Fix C++ undefined behavior with last commit. All tests now pass. 
bee1d74f Add support for unbiased errors and tighter error bounds 9cb007ff CUDA optimization: unrolling optimization gather_partial functions f7632023 Fix RuntimeError using int64 in zfpy on Windows (Pull Request #60) d488a6f6 Appveyor: build zfpy on MSVC release builds with python 2.7 and 3.5 (expect zfpy failures on MSVC) 8f175692 Fix mixed declarations and code c0a7acc4 Update contact email 5384b571 Merge branch 'release0.5.5' into develop git-subtree-dir: src/zfp git-subtree-split: f39af72648a2aeb88e9b2cca8c64f51b493ad5f4 --- .github/workflows/main.yml | 107 +++ .gitignore | 4 + .travis.yml | 303 ------ CHANGELOG.md | 409 +++++++++ CITATION.cff | 17 + CMakeLists.txt | 106 ++- CONTRIBUTING.md | 13 + CTestConfig.cmake | 3 + Config | 86 +- LICENSE | 66 +- Makefile | 4 +- NOTICE | 21 + README.md | 196 ++-- SUPPORT.md | 11 + VERSIONS.md | 298 ------ appveyor.sh | 18 +- appveyor.yml | 75 +- array/zfp/header.h | 19 - array/zfp/headerHelpers.h | 144 --- array/zfp/iterator1.h | 38 - array/zfp/iterator2.h | 42 - array/zfp/iterator3.h | 50 - array/zfp/memory.h | 145 --- array/zfp/pointer1.h | 30 - array/zfp/pointer2.h | 42 - array/zfp/pointer3.h | 48 - array/zfp/reference1.h | 27 - array/zfp/reference2.h | 27 - array/zfp/reference3.h | 27 - array/zfp/view1.h | 291 ------ array/zfp/view2.h | 393 -------- array/zfp/view3.h | 445 --------- array/zfparray.h | 286 ------ array/zfparray1.h | 297 ------ array/zfparray2.h | 324 ------- array/zfparray3.h | 338 ------- array/zfpcodec.h | 17 - array/zfpcodecd.h | 149 --- array/zfpcodecf.h | 149 --- array/zfpfactory.h | 98 -- cfp/CMakeLists.txt | 37 +- cfp/{src => }/Makefile | 12 +- cfp/cfp.cpp | 868 ++++++++++++++++++ cfp/cfparray1d.cpp | 21 + cfp/cfparray1f.cpp | 21 + cfp/cfparray2d.cpp | 21 + cfp/cfparray2f.cpp | 21 + cfp/cfparray3d.cpp | 21 + cfp/cfparray3f.cpp | 21 + cfp/cfparray4d.cpp | 21 + cfp/cfparray4f.cpp | 21 + cfp/cfpheader.cpp | 21 + cfp/include/cfparray1d.h | 37 - cfp/include/cfparray1f.h | 37 - cfp/include/cfparray2d.h | 39 - cfp/include/cfparray2f.h | 39 - cfp/include/cfparray3d.h | 40 - cfp/include/cfparray3f.h | 40 - cfp/include/cfparrays.h | 28 - cfp/src/CMakeLists.txt | 38 - cfp/src/cfparray1_source.cpp | 23 - cfp/src/cfparray1d.cpp | 15 - cfp/src/cfparray1f.cpp | 15 - cfp/src/cfparray2_source.cpp | 35 - cfp/src/cfparray2d.cpp | 15 - cfp/src/cfparray2f.cpp | 15 - cfp/src/cfparray3_source.cpp | 41 - cfp/src/cfparray3d.cpp | 15 - cfp/src/cfparray3f.cpp | 15 - cfp/src/cfparray_source.cpp | 106 --- cfp/src/cfparrays.cpp | 183 ---- cfp/template/cfparray.cpp | 136 +++ cfp/template/cfparray1.cpp | 332 +++++++ cfp/template/cfparray2.cpp | 468 ++++++++++ cfp/template/cfparray3.cpp | 522 +++++++++++ cfp/template/cfparray4.cpp | 576 ++++++++++++ cfp/template/cfpheader.cpp | 166 ++++ cmake/appveyor.cmake | 15 + cmake/travis.cmake | 87 -- examples/CMakeLists.txt | 21 +- examples/Makefile | 47 +- examples/array.cpp | 42 + examples/array2d.h | 58 -- examples/array2d.hpp | 72 ++ examples/diffusion.cpp | 422 ++++++--- examples/diffusionC.c | 218 +++-- examples/inplace.c | 2 +- examples/iterator.cpp | 10 +- examples/iteratorC.c | 97 ++ examples/pgm.c | 2 +- examples/ppm.c | 390 ++++++++ examples/simple.c | 25 +- examples/speed.c | 2 +- fortran/CMakeLists.txt | 16 +- fortran/Makefile | 16 +- fortran/{zfp.f => zfp.f90} | 360 +++++--- include/zfp.h | 507 +++++----- include/zfp.hpp | 289 ++++++ include/zfp/array.h | 32 + include/zfp/array.hpp | 95 ++ include/zfp/array1.hpp | 265 ++++++ include/zfp/array2.hpp | 301 ++++++ include/zfp/array3.hpp | 316 +++++++ 
include/zfp/array4.hpp | 331 +++++++ include/{ => zfp}/bitstream.h | 38 +- .../bitstream.c => include/zfp/bitstream.inl | 140 +-- include/zfp/codec/gencodec.hpp | 421 +++++++++ include/zfp/codec/zfpcodec.hpp | 551 +++++++++++ include/zfp/constarray1.hpp | 265 ++++++ include/zfp/constarray2.hpp | 288 ++++++ include/zfp/constarray3.hpp | 300 ++++++ include/zfp/constarray4.hpp | 312 +++++++ include/zfp/factory.hpp | 119 +++ include/zfp/index.hpp | 537 +++++++++++ .../zfp/internal/array/cache.hpp | 50 +- include/zfp/internal/array/cache1.hpp | 201 ++++ include/zfp/internal/array/cache2.hpp | 207 +++++ include/zfp/internal/array/cache3.hpp | 213 +++++ include/zfp/internal/array/cache4.hpp | 219 +++++ include/zfp/internal/array/exception.hpp | 18 + include/zfp/internal/array/handle1.hpp | 38 + include/zfp/internal/array/handle2.hpp | 38 + include/zfp/internal/array/handle3.hpp | 38 + include/zfp/internal/array/handle4.hpp | 38 + include/zfp/internal/array/header.hpp | 41 + include/zfp/internal/array/iterator1.hpp | 137 +++ include/zfp/internal/array/iterator2.hpp | 230 +++++ include/zfp/internal/array/iterator3.hpp | 265 ++++++ include/zfp/internal/array/iterator4.hpp | 300 ++++++ include/zfp/internal/array/memory.hpp | 200 ++++ include/zfp/internal/array/pointer1.hpp | 118 +++ include/zfp/internal/array/pointer2.hpp | 136 +++ include/zfp/internal/array/pointer3.hpp | 145 +++ include/zfp/internal/array/pointer4.hpp | 154 ++++ include/zfp/internal/array/reference1.hpp | 78 ++ include/zfp/internal/array/reference2.hpp | 80 ++ include/zfp/internal/array/reference3.hpp | 82 ++ include/zfp/internal/array/reference4.hpp | 84 ++ include/zfp/internal/array/store.hpp | 255 +++++ include/zfp/internal/array/store1.hpp | 140 +++ include/zfp/internal/array/store2.hpp | 147 +++ include/zfp/internal/array/store3.hpp | 154 ++++ include/zfp/internal/array/store4.hpp | 161 ++++ include/zfp/internal/array/traits.hpp | 30 + include/zfp/internal/array/view1.hpp | 303 ++++++ include/zfp/internal/array/view2.hpp | 498 ++++++++++ include/zfp/internal/array/view3.hpp | 584 ++++++++++++ include/zfp/internal/array/view4.hpp | 679 ++++++++++++++ include/zfp/internal/cfp/array1d.h | 141 +++ include/zfp/internal/cfp/array1f.h | 141 +++ include/zfp/internal/cfp/array2d.h | 144 +++ include/zfp/internal/cfp/array2f.h | 144 +++ include/zfp/internal/cfp/array3d.h | 146 +++ include/zfp/internal/cfp/array3f.h | 146 +++ include/zfp/internal/cfp/array4d.h | 148 +++ include/zfp/internal/cfp/array4f.h | 148 +++ include/zfp/internal/cfp/header.h | 8 + include/zfp/internal/codec/genheader.hpp | 76 ++ include/zfp/internal/codec/zfpheader.hpp | 129 +++ .../zfp/internal/zfp}/inline.h | 4 +- include/zfp/{ => internal/zfp}/macros.h | 0 include/zfp/{ => internal/zfp}/system.h | 24 +- include/zfp/{ => internal/zfp}/types.h | 27 +- include/zfp/version.h | 49 + python/CMakeLists.txt | 8 +- python/eyescale-cmake/FindNumPy.cmake | 41 - python/eyescale-cmake/LICENSE.txt | 26 - python/scikit-build-cmake/FindCython.cmake | 9 +- python/scikit-build-cmake/FindNumPy.cmake | 106 +++ .../FindPythonExtensions.cmake | 21 +- python/scikit-build-cmake/LICENSE | 3 - python/scikit-build-cmake/UseCython.cmake | 32 +- ...targetLinkLibrariesWithDynamicLookup.cmake | 2 +- python/zfpy.pxd | 73 +- python/zfpy.pyx | 26 +- setup.py | 15 + src/CMakeLists.txt | 17 +- src/Makefile | 2 +- src/bitstream.c | 6 +- src/cuda_zfp/CMakeLists.txt | 1 - src/cuda_zfp/constant_setup.cuh | 39 - src/cuda_zfp/constants.h | 6 +- src/cuda_zfp/cuZFP.cu | 84 +- src/cuda_zfp/decode.cuh | 154 
++-- src/cuda_zfp/decode1.cuh | 10 +- src/cuda_zfp/decode2.cuh | 18 +- src/cuda_zfp/decode3.cuh | 23 +- src/cuda_zfp/encode.cuh | 100 +- src/cuda_zfp/encode1.cuh | 12 +- src/cuda_zfp/encode2.cuh | 21 +- src/cuda_zfp/encode3.cuh | 31 +- src/cuda_zfp/shared.h | 51 +- src/cuda_zfp/type_info.cuh | 17 +- src/decode1d.c | 7 +- src/decode1f.c | 7 +- src/decode1i.c | 7 +- src/decode1l.c | 7 +- src/decode2d.c | 7 +- src/decode2f.c | 7 +- src/decode2i.c | 7 +- src/decode2l.c | 7 +- src/decode3d.c | 7 +- src/decode3f.c | 7 +- src/decode3i.c | 7 +- src/decode3l.c | 7 +- src/decode4d.c | 7 +- src/decode4f.c | 7 +- src/decode4i.c | 7 +- src/decode4l.c | 7 +- src/encode1d.c | 7 +- src/encode1f.c | 7 +- src/encode1i.c | 7 +- src/encode1l.c | 7 +- src/encode2d.c | 7 +- src/encode2f.c | 7 +- src/encode2i.c | 7 +- src/encode2l.c | 7 +- src/encode3d.c | 7 +- src/encode3f.c | 7 +- src/encode3i.c | 7 +- src/encode3l.c | 7 +- src/encode4d.c | 7 +- src/encode4f.c | 7 +- src/encode4i.c | 7 +- src/encode4l.c | 7 +- src/share/omp.c | 20 +- src/share/parallel.c | 68 +- src/template/codec.c | 6 + src/template/codecf.c | 8 +- src/template/compress.c | 54 +- src/template/cudacompress.c | 16 +- src/template/cudadecompress.c | 16 +- src/template/decode.c | 194 +++- src/template/decode1.c | 32 +- src/template/decode2.c | 30 +- src/template/decode3.c | 30 +- src/template/decode4.c | 30 +- src/template/decodef.c | 10 +- src/template/decodei.c | 4 +- src/template/decompress.c | 54 +- src/template/encode.c | 157 +++- src/template/encode1.c | 34 +- src/template/encode2.c | 34 +- src/template/encode3.c | 34 +- src/template/encode4.c | 34 +- src/template/encodef.c | 26 +- src/template/encodei.c | 4 +- src/template/ompcompress.c | 150 +-- src/template/revdecode.c | 11 +- src/template/revdecodef.c | 7 +- src/template/revencode.c | 13 +- src/template/revencodef.c | 6 +- src/traitsd.h | 1 + src/traitsf.h | 1 + src/zfp.c | 353 +++++-- tests/Makefile | 5 +- tests/gitlab/corona-jobs.yml | 17 + tests/gitlab/corona-templates.yml | 12 + tests/gitlab/gitlab-ci.yml | 126 +++ tests/gitlab/pascal-jobs.yml | 16 + tests/gitlab/pascal-templates.yml | 12 + tests/gitlab/quartz-jobs.yml | 77 ++ tests/gitlab/quartz-templates.yml | 12 + tests/testzfp.cpp | 126 ++- travis.sh | 60 -- utils/CMakeLists.txt | 12 +- utils/Makefile | 4 +- utils/zfp.c | 64 +- zfp-config-version.cmake.in | 2 + zfp-config.cmake.in | 13 +- 270 files changed, 19943 insertions(+), 7020 deletions(-) create mode 100644 .github/workflows/main.yml delete mode 100644 .travis.yml create mode 100644 CHANGELOG.md create mode 100644 CITATION.cff create mode 100644 CONTRIBUTING.md create mode 100644 NOTICE create mode 100644 SUPPORT.md delete mode 100644 VERSIONS.md delete mode 100644 array/zfp/header.h delete mode 100644 array/zfp/headerHelpers.h delete mode 100644 array/zfp/iterator1.h delete mode 100644 array/zfp/iterator2.h delete mode 100644 array/zfp/iterator3.h delete mode 100644 array/zfp/memory.h delete mode 100644 array/zfp/pointer1.h delete mode 100644 array/zfp/pointer2.h delete mode 100644 array/zfp/pointer3.h delete mode 100644 array/zfp/reference1.h delete mode 100644 array/zfp/reference2.h delete mode 100644 array/zfp/reference3.h delete mode 100644 array/zfp/view1.h delete mode 100644 array/zfp/view2.h delete mode 100644 array/zfp/view3.h delete mode 100644 array/zfparray.h delete mode 100644 array/zfparray1.h delete mode 100644 array/zfparray2.h delete mode 100644 array/zfparray3.h delete mode 100644 array/zfpcodec.h delete mode 100644 array/zfpcodecd.h delete mode 100644 
array/zfpcodecf.h delete mode 100644 array/zfpfactory.h rename cfp/{src => }/Makefile (59%) create mode 100644 cfp/cfp.cpp create mode 100644 cfp/cfparray1d.cpp create mode 100644 cfp/cfparray1f.cpp create mode 100644 cfp/cfparray2d.cpp create mode 100644 cfp/cfparray2f.cpp create mode 100644 cfp/cfparray3d.cpp create mode 100644 cfp/cfparray3f.cpp create mode 100644 cfp/cfparray4d.cpp create mode 100644 cfp/cfparray4f.cpp create mode 100644 cfp/cfpheader.cpp delete mode 100644 cfp/include/cfparray1d.h delete mode 100644 cfp/include/cfparray1f.h delete mode 100644 cfp/include/cfparray2d.h delete mode 100644 cfp/include/cfparray2f.h delete mode 100644 cfp/include/cfparray3d.h delete mode 100644 cfp/include/cfparray3f.h delete mode 100644 cfp/include/cfparrays.h delete mode 100644 cfp/src/CMakeLists.txt delete mode 100644 cfp/src/cfparray1_source.cpp delete mode 100644 cfp/src/cfparray1d.cpp delete mode 100644 cfp/src/cfparray1f.cpp delete mode 100644 cfp/src/cfparray2_source.cpp delete mode 100644 cfp/src/cfparray2d.cpp delete mode 100644 cfp/src/cfparray2f.cpp delete mode 100644 cfp/src/cfparray3_source.cpp delete mode 100644 cfp/src/cfparray3d.cpp delete mode 100644 cfp/src/cfparray3f.cpp delete mode 100644 cfp/src/cfparray_source.cpp delete mode 100644 cfp/src/cfparrays.cpp create mode 100644 cfp/template/cfparray.cpp create mode 100644 cfp/template/cfparray1.cpp create mode 100644 cfp/template/cfparray2.cpp create mode 100644 cfp/template/cfparray3.cpp create mode 100644 cfp/template/cfparray4.cpp create mode 100644 cfp/template/cfpheader.cpp delete mode 100644 cmake/travis.cmake create mode 100644 examples/array.cpp delete mode 100644 examples/array2d.h create mode 100644 examples/array2d.hpp create mode 100644 examples/iteratorC.c create mode 100644 examples/ppm.c rename fortran/{zfp.f => zfp.f90} (78%) create mode 100644 include/zfp.hpp create mode 100644 include/zfp/array.h create mode 100644 include/zfp/array.hpp create mode 100644 include/zfp/array1.hpp create mode 100644 include/zfp/array2.hpp create mode 100644 include/zfp/array3.hpp create mode 100644 include/zfp/array4.hpp rename include/{ => zfp}/bitstream.h (64%) rename src/inline/bitstream.c => include/zfp/bitstream.inl (73%) create mode 100644 include/zfp/codec/gencodec.hpp create mode 100644 include/zfp/codec/zfpcodec.hpp create mode 100644 include/zfp/constarray1.hpp create mode 100644 include/zfp/constarray2.hpp create mode 100644 include/zfp/constarray3.hpp create mode 100644 include/zfp/constarray4.hpp create mode 100644 include/zfp/factory.hpp create mode 100644 include/zfp/index.hpp rename array/zfp/cache.h => include/zfp/internal/array/cache.hpp (82%) create mode 100644 include/zfp/internal/array/cache1.hpp create mode 100644 include/zfp/internal/array/cache2.hpp create mode 100644 include/zfp/internal/array/cache3.hpp create mode 100644 include/zfp/internal/array/cache4.hpp create mode 100644 include/zfp/internal/array/exception.hpp create mode 100644 include/zfp/internal/array/handle1.hpp create mode 100644 include/zfp/internal/array/handle2.hpp create mode 100644 include/zfp/internal/array/handle3.hpp create mode 100644 include/zfp/internal/array/handle4.hpp create mode 100644 include/zfp/internal/array/header.hpp create mode 100644 include/zfp/internal/array/iterator1.hpp create mode 100644 include/zfp/internal/array/iterator2.hpp create mode 100644 include/zfp/internal/array/iterator3.hpp create mode 100644 include/zfp/internal/array/iterator4.hpp create mode 100644 include/zfp/internal/array/memory.hpp create 
mode 100644 include/zfp/internal/array/pointer1.hpp create mode 100644 include/zfp/internal/array/pointer2.hpp create mode 100644 include/zfp/internal/array/pointer3.hpp create mode 100644 include/zfp/internal/array/pointer4.hpp create mode 100644 include/zfp/internal/array/reference1.hpp create mode 100644 include/zfp/internal/array/reference2.hpp create mode 100644 include/zfp/internal/array/reference3.hpp create mode 100644 include/zfp/internal/array/reference4.hpp create mode 100644 include/zfp/internal/array/store.hpp create mode 100644 include/zfp/internal/array/store1.hpp create mode 100644 include/zfp/internal/array/store2.hpp create mode 100644 include/zfp/internal/array/store3.hpp create mode 100644 include/zfp/internal/array/store4.hpp create mode 100644 include/zfp/internal/array/traits.hpp create mode 100644 include/zfp/internal/array/view1.hpp create mode 100644 include/zfp/internal/array/view2.hpp create mode 100644 include/zfp/internal/array/view3.hpp create mode 100644 include/zfp/internal/array/view4.hpp create mode 100644 include/zfp/internal/cfp/array1d.h create mode 100644 include/zfp/internal/cfp/array1f.h create mode 100644 include/zfp/internal/cfp/array2d.h create mode 100644 include/zfp/internal/cfp/array2f.h create mode 100644 include/zfp/internal/cfp/array3d.h create mode 100644 include/zfp/internal/cfp/array3f.h create mode 100644 include/zfp/internal/cfp/array4d.h create mode 100644 include/zfp/internal/cfp/array4f.h create mode 100644 include/zfp/internal/cfp/header.h create mode 100644 include/zfp/internal/codec/genheader.hpp create mode 100644 include/zfp/internal/codec/zfpheader.hpp rename {src/inline => include/zfp/internal/zfp}/inline.h (77%) rename include/zfp/{ => internal/zfp}/macros.h (100%) rename include/zfp/{ => internal/zfp}/system.h (59%) rename include/zfp/{ => internal/zfp}/types.h (78%) create mode 100644 include/zfp/version.h delete mode 100644 python/eyescale-cmake/FindNumPy.cmake delete mode 100644 python/eyescale-cmake/LICENSE.txt create mode 100644 python/scikit-build-cmake/FindNumPy.cmake create mode 100644 setup.py delete mode 100644 src/cuda_zfp/constant_setup.cuh create mode 100644 src/template/codec.c create mode 100644 tests/gitlab/corona-jobs.yml create mode 100644 tests/gitlab/corona-templates.yml create mode 100644 tests/gitlab/gitlab-ci.yml create mode 100644 tests/gitlab/pascal-jobs.yml create mode 100644 tests/gitlab/pascal-templates.yml create mode 100644 tests/gitlab/quartz-jobs.yml create mode 100644 tests/gitlab/quartz-templates.yml delete mode 100755 travis.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 00000000..eeb42d06 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,107 @@ +name: build +on: push +env: + BUILD_TYPE: Release +jobs: + build: + runs-on: ${{matrix.os}} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + cxx_compiler: g++-10 + c_compiler: gcc-10 + omp: ON + target: all + + - os: ubuntu-latest + cxx_compiler: clang++ + c_compiler: clang + omp: ON + target: all + + #- os: windows-latest + # cxx_compiler: msbuild + # c_compiler: msbuild + # omp: ON + # target: ALL_BUILD + + #- os: windows-latest + # cxx_compiler: x86_64-w64-mingw32-g++ + # c_compiler: x86_64-w64-mingw32-gcc + # omp: ON + # target: all + # generator: '-G "MinGW Makefiles"' + + - os: macos-latest + cxx_compiler: clang++ + c_compiler: clang + omp: OFF + target: all + + - os: macos-latest + cxx_compiler: g++-10 + c_compiler: gcc-10 + omp: ON + target: all + steps: 
+ - uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + architecture: x64 + + - name: Install zfpy dependencies + run: | + python -m pip install cython + python -m pip install numpy + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + architecture: x64 + + - name: Install zfpy dependencies + run: | + python -m pip install cython + python -m pip install numpy + + - name: Setup MSBuild (Windows) + id: msbuild + if: ${{matrix.os == 'windows-latest' && matrix.cxx_compiler == 'msbuild'}} + uses: microsoft/setup-msbuild@v1.0.3 + + #- name: Setup MinGW (Windows) + # id: mingw + # if: ${{matrix.os == 'windows-latest' && matrix.cxx_compiler == 'x86_64-w64-mingw32-g++'}} + # uses: egor-tensin/setup-mingw@v2 + + - name: CI Settings + id: settings + run: | + echo "os: ${{matrix.os}}" + echo "compilers:" + echo " cxx: ${{matrix.cxx_compiler}}" + echo " c: ${{matrix.c_compiler}}" + echo "OpenMP: ${{matrix.omp}}" + + - name: Run CMake + id: cmake + run: cmake -B ${{github.workspace}}/build ${{matrix.generator}} -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_CXX_COMPILER=${{matrix.cxx_compiler}} -DCMAKE_C_COMPILER=${{matrix.c_compiler}} -DBUILD_TESTING=ON -DZFP_WITH_OPENMP=${{matrix.omp}} -DBUILD_ZFPY=ON -DPYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") -DPYTHON_LIBRARY=$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))") + + - name: Build + id: build + run: cmake --build ${{github.workspace}}/build --target ${{matrix.target}} --config ${{env.BUILD_TYPE}} + + - name: Run Tests + id: test + working-directory: ${{github.workspace}}/build + run: ctest -C ${{env.BUILD_TYPE}} -VV + + # Interactive Debug -> see: https://github.com/mxschmitt/action-tmate + #- name: Setup Debug Session + # uses: mxschmitt/action-tmate@v3 diff --git a/.gitignore b/.gitignore index 581f2abe..66f13148 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ bin build lib +dist +wheelhouse +zfpy.egg-info +modules diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 40287688..00000000 --- a/.travis.yml +++ /dev/null @@ -1,303 +0,0 @@ -language: - - generic - -matrix: - include: - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' COVERAGE='ON' - - - os: linux - dist: xenial - compiler: clang-3.6 - addons: &clang36 - apt: - sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty'] - packages: - - clang-3.6 - - g++-7 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='clang-3.6' CXX='clang++-3.6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: clang-4.0 - before_install: - - export LD_LIBRARY_PATH=/usr/local/clang/lib:$LD_LIBRARY_PATH - addons: &clang40 - apt: - sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-trusty-4.0'] - packages: - - clang-4.0 - - g++-7 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='clang-4.0' CXX='clang++-4.0' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: gcc-4.4 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-4.4 - - g++-4.4 - - gfortran-4.4 - - libpython3.5-dev - - 
cython3 - - python3-numpy - env: CC='gcc-4.4' CXX='g++-4.4' FC='gfortran-4.4' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: gcc-4.7 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-4.7 - - g++-4.7 - - gfortran-4.7 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-4.7' CXX='g++-4.7' FC='gfortran-4.7' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: gcc-4.8 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-4.8 - - g++-4.8 - - gfortran-4.8 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-4.8' CXX='g++-4.8' FC='gfortran-4.8' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: gcc-4.9 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-4.9 - - g++-4.9 - - gfortran-4.9 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-4.9' CXX='g++-4.9' FC='gfortran-4.9' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' - - - os: linux - dist: trusty - compiler: gcc-5 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-5 - - g++-5 - - gfortran-5 - - libpython2.7 - - python-pip - env: CC='gcc-5' CXX='g++-5' FC='gfortran-5' FORTRAN_STANDARD='2003' PYTHON_VERSION='2.7' - - - os: linux - dist: trusty - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython2.7 - - python-pip - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='2.7' - - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' C_STANDARD='90' - - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' C_STANDARD='11' - - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5-dev - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' CXX_STANDARD='11' - - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5 - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2003' PYTHON_VERSION='3.5' CXX_STANDARD='14' - - - os: linux - dist: xenial - compiler: gcc-6 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-6 - - g++-6 - - gfortran-6 - - libpython3.5 - - cython3 - - python3-numpy - env: CC='gcc-6' CXX='g++-6' FC='gfortran-6' FORTRAN_STANDARD='2008' PYTHON_VERSION='3.5' - - - os: linux - dist: xenial - compiler: gcc-7 - addons: - apt: - sources: ubuntu-toolchain-r-test - packages: - - gcc-7 - - g++-7 - - gfortran-7 - - libpython3.5 - - cython3 - - python3-numpy - env: CC='gcc-7' CXX='g++-7' FC='gfortran-7' FORTRAN_STANDARD='2008' PYTHON_VERSION='3.5' - - - os: osx - osx_image: xcode7.3 - compiler: gcc - env: CC='gcc' CXX='g++' PYTHON_VERSION='3.5' - - - os: osx - osx_image: xcode8.3 - compiler: gcc - env: CC='gcc' CXX='g++' 
PYTHON_VERSION='2.7' - - - os: osx - osx_image: xcode7.3 - compiler: clang - env: CC='clang' CXX='clang++' PYTHON_VERSION='3.5' - - - os: osx - osx_image: xcode8.3 - compiler: clang - env: CC='clang' CXX='clang++' PYTHON_VERSION='2.7' - -script: - - if [ "$TRAVIS_OS_NAME" == "osx" ]; then pyenv root; fi - - | - if [ "$TRAVIS_OS_NAME" = "osx" ] && [ "$PYTHON_VERSION" = "2.7" ]; then - pyenv install 2.7.12; - export PYTHON_INCLUDE_DIR=$(pyenv root)/versions/2.7.12/include/python2.7; - export PYTHON_LIBRARY=$(pyenv root)/versions/2.7.12/lib/libpython2.7.dylib; - export PYTHON_EXECUTABLE=$(pyenv root)/versions/2.7.12/bin/python2.7; - fi - - | - if [ "$TRAVIS_OS_NAME" = "osx" ] && [ "$PYTHON_VERSION" = "3.5" ]; then - pyenv install 3.5.0; - export PYTHON_INCLUDE_DIR=$(pyenv root)/versions/3.5.0/include/python3.5m; - export PYTHON_LIBRARY=$(pyenv root)/versions/3.5.0/lib/libpython3.5m.a; - export PYTHON_EXECUTABLE=$(pyenv root)/versions/3.5.0/bin/python3.5m; - fi - - | - if [ "$TRAVIS_OS_NAME" == "osx" ]; then - $PYTHON_EXECUTABLE -m pip install --upgrade pip; - $PYTHON_EXECUTABLE -m pip install -r ${TRAVIS_BUILD_DIR}/python/requirements.txt; - fi - - - | - if [ "$TRAVIS_OS_NAME" = "linux" ]; then - export PYTHON_EXECUTABLE=/usr/bin/python$PYTHON_VERSION; - source /etc/lsb-release; - fi - - | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$PYTHON_VERSION" = "2.7" ]; then - export PYTHON_INCLUDE_DIR=/usr/include/python2.7; - export PYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython2.7.so; - fi - - | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$PYTHON_VERSION" = "3.5" ]; then - export PYTHON_INCLUDE_DIR=/usr/include/python3.5m; - export PYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.5m.so; - fi - - | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$DISTRIB_CODENAME" = "trusty" ] && [ "$PYTHON_VERSION" = "2.7" ]; then - sudo $PYTHON_EXECUTABLE -m pip install --upgrade pip; - sudo $PYTHON_EXECUTABLE -m pip install -r ${TRAVIS_BUILD_DIR}/python/requirements.txt; - fi - - | - if [ "$TRAVIS_OS_NAME" = "linux" ] && [ "$DISTRIB_CODENAME" = "trusty" ] && [ "$PYTHON_VERSION" = "3.5" ]; then - echo "Python 3.5 not supported on Ubuntu Trusty"; - exit 1; - fi - - - printenv | grep PYTHON - - ./travis.sh - -after_success: - - if [[ -n "${COVERAGE}" ]]; then bash <(curl -s https://codecov.io/bash); fi diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..3251d515 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,409 @@ +Change Log +========== + +--- + +## 1.0.0 (2022-08-01) + +This release is not ABI compatible with prior releases due to numerous changes +to function signatures and data structures like `zfp_field`. However, few of +the API changes, other than to the cfp C API for compressed arrays, should +impact existing code. Note that numerous header files have been renamed or +moved relative to prior versions. + +### Added + +- `zfp::const_array`: read-only variable-rate array that supports + fixed-precision, fixed-accuracy, and reversible modes. +- Compressed-array classes for 4D data. +- `const` versions of array references, pointers, and iterators. +- A more complete API for pointers and iterators. +- cfp support for proxy references and pointers, iterators, and + (de)serialization. +- Support for pointers and iterators into array views. +- `zfp::array::size_bytes()` allows querying the size of different components + of an array object (e.g., payload, cache, index, metadata, ...). +- Templated C++ wrappers around the low-level C API. 
+- A generic codec for storing blocks of uncompressed scalars in zfp's + C++ arrays. +- Additional functions for querying `zfp_field` and `zfp_stream` structs. +- `zfp_config`: struct that encapsulates compression mode and parameters. +- Rounding modes for reducing bias in compression errors. +- New examples: `array`, `iteratorC`, and `ppm`. + +### Changed + +- Headers from `array/`, `cfp/include/`, and `include/` have been renamed + and reorganized into a common `include/` directory. + - The libzfp API is now confined to `zfp.h`, `zfp.hpp`, and `zfp.mod` + for C, C++, and Fortran bindings, respectively. These all appear in + the top-level `include/` directory upon installation. + - C++ headers now use a `.hpp` suffix; C headers use a `.h` suffix. + - C++ headers like `array/zfparray.h` have been renamed `zfp/array.hpp`. + - C headers like `cfp/include/cfparrays.h` have been renamed `zfp/array.h`. +- `size_t` and `ptrdiff_t` replace `uint` and `int` for array sizes and + strides in the array classes and C/Fortran APIs. +- `zfp_bool` replaces `int` as Boolean type in the C API. +- `bitstream_offset` and `bitstream_size` replace `size_t` to ensure support + for 64-bit offsets into and lengths of bit streams. Consequently, the + `bitstream` API has changed accordingly. +- All array and view iterators are now random-access iterators. +- Array inspectors now return `const_reference` rather than a scalar + type like `float` to allow obtaining a `const_pointer` to an element + of an immutable array. +- `zfp::array::compressed_data()` now returns `void*` instead of `uchar*`. +- The array (de)serialization API has been revised, resulting in new + `zfp::array::header` and `zfp::exception` classes with new exception + messages. +- The array `codec` class is now responsible for all details regarding + compression. +- The compressed-array C++ implementation has been completely refactored to + make it more modular, extensible, and reusable across array types. +- Array block shapes are now computed on the fly rather than stored. +- The cfp C API now wraps array objects in structs. +- The zfpy Python API now supports the more general `memoryview` over + `bytes` objects for decompression. +- The zFORp Fortran module name is now `zfp` instead of `zforp_module`. +- Some command-line options for the `diffusion` example have changed. +- CMake 3.9 or later is now required for CMake builds. + +### Removed + +- `zfp::array::get_header()` has been replaced with a `zfp::array::header` + constructor that accepts an array object. +- `ZFP_VERSION_RELEASE` is no longer defined (use `ZFP_VERSION_PATCH`). + +### Fixed + +- #66: `make install` overwrites googletest. +- #84: Incorrect order of parameters in CUDA `memset()`. +- #86: C++ compiler warns when `__STDC_VERSION__` is undefined. +- #87: `CXXFLAGS` is misspelled in `cfp/src/Makefile`. +- #98: `zfp_stream_maximum_size()` underestimates size in reversible mode. +- #99: Incorrect `private_view` reads due to missing writeback. +- #109: Unused CPython array is incompatible with PyPy. +- #112: PGI compiler bug causes issues with memory alignment. +- #119: All-subnormal blocks may cause floating-point overflow. +- #121: CUDA bit offsets are limited to 32 bits. +- #122: `make install` does not install zfp command-line utility. +- #125: OpenMP bit offsets are limited to 32 bits. +- #126: `make install` does not install Fortran module. +- #127: Reversible mode reports incorrect compressed block size. +- #150: cmocka tests do not build on macOS. 
+- #154: Thread safety is broken in `private_view` and `private_const_view`. +- `ZFP_MAX_BITS` is off by one. +- `diffusionC`, `iteratorC` are not being built with `gmake`. + +--- + +## 0.5.5 (2019-05-05) + +### Added + +- Support for reversible (lossless) compression of floating-point and + integer data. +- Methods for serializing and deserializing zfp's compressed arrays. +- Python bindings for compressing NumPy arrays. +- Fortran bindings to zfp's high-level C API. + +### Changed + +- The default compressed-array cache size is now a function of the total + number of array elements, irrespective of array shape. + +### Fixed + +- Incorrect handling of execution policy in zfp utility. +- Incorrect handling of decompression via header in zfp utility. +- Incorrect cleanup of device memory in CUDA decompress. +- Missing tests for failing mallocs. +- CMake does not install CFP when built. +- `zfp_write_header()` and `zfp_field_metadata()` succeed even if array + dimensions are too large to fit in header. + +--- + +## 0.5.4 (2018-10-01) + +### Added + +- Support for CUDA fixed-rate compression and decompression. +- Views into compressed arrays for thread safety, nested array indexing, + slicing, and array subsetting. +- C language bindings for compressed arrays. +- Support for compressing and decompressing 4D data. + +### Changed + +- Execution policy now applies to both compression and decompression. +- Compressed array accessors now return Scalar type instead of + `const Scalar&` to avoid stale references to evicted cache lines. + +### Fixed + +- Incorrect handling of negative strides. +- Incorrect handling of arrays with more than 2^32 elements in zfp command-line + tool. +- `bitstream` is not C++ compatible. +- Minimum cache size request is not respected. + +--- + +## 0.5.3 (2018-03-28) + +### Added + +- Support for OpenMP multithreaded compression (but not decompression). +- Options for OpenMP execution in zfp command-line tool. +- Compressed-array support for copy construction and assignment via deep + copies. +- Virtual destructors to enable inheritance from zfp arrays. + +### Changed + +- `zfp_decompress()` now returns the number of compressed bytes processed so + far, i.e., the same value returned by `zfp_compress()`. + +--- + +## 0.5.2 (2017-09-28) + +### Added + +- Iterators and proxy objects for pointers and references. +- Example illustrating how to use iterators and pointers. + +### Changed + +- Diffusion example now optionally uses iterators. +- Moved internal headers under array to `array/zfp`. +- Modified 64-bit integer typedefs to avoid the C89 non-compliant `long long` + and allow for user-supplied types and literal suffixes. +- Renamed compile-time macros that did not have a `ZFP` prefix. +- Rewrote documentation in reStructuredText and added complete documentation + of all public functions, classes, types, and macros. + +### Fixed + +- Issue with setting stream word type via CMake. + +--- + +## 0.5.1 (2017-03-28) + +This release primarily fixes a few minor issues but also includes changes in +anticipation of a large number of planned future additions to the library. +No changes have been made to the compressed format, which is backwards +compatible with version 0.5.0. + +### Added + +- High-level API support for integer types. +- Example that illustrates in-place compression. +- Support for CMake builds. +- Documentation that discusses common issues with using zfp. + +### Changed + +- Separated library version from CODEC version and added version string. 
+- Corrected inconsistent naming of `BIT_STREAM` macros in code and + documentation. +- Renamed some of the header bit mask macros. +- `stream_skip()` and `stream_flush()` now return the number of bits skipped + or output. +- Renamed `stream_block()` and `stream_delta()` to make it clear that they + refer to strided streams. Added missing definition of + `stream_stride_block()`. +- Changed `int` and `uint` types in places to use `ptrdiff_t` and `size_t` + where appropriate. +- Changed API for `zfp_set_precision()` and `zfp_set_accuracy()` to not + require the scalar type. +- Added missing `static` keyword in `decode_block()`. +- Changed `testzfp` to allow specifying which tests to perform on the + command line. +- Modified directory structure. + +### Fixed + +- Bug that prevented defining uninitialized arrays. +- Incorrect computation of array sizes in `zfp_field_size()`. +- Minor issues that prevented code from compiling on Windows. +- Issue with fixed-accuracy headers that caused unnecessary storage. + +--- + +## 0.5.0 (2016-02-29) + +This version introduces backwards incompatible changes to the CODEC. + +### Added + +- Modified CODEC to more efficiently encode blocks whose values are all + zero or are smaller in magnitude than the absolute error tolerance. + This allows representing "empty" blocks using only one bit each. +- Added functions for compactly encoding the compression parameters + and field meta data, e.g., for producing self-contained compressed + streams. Also added functions for reading and writing a header + containing these parameters. + +### Changed + +- Changed behavior of `zfp_compress()` and `zfp_decompress()` to not + automatically rewind the bit stream. This makes it easier to concatenate + multiple compressed bit streams, e.g., when compressing vector fields or + multiple scalars together. +- Changed the zfp example program interface to allow reading and writing + compressed streams, optionally with a header. The zfp tool can now be + used to compress and decompress files as a stand alone utility. + +--- + +## 0.4.1 (2015-12-28) + +### Added + +- Added `simple.c` as a minimal example of how to call the compressor. + +### Changed + +- Changed compilation of diffusion example to output two executables: + one with and one without compression. + +### Fixed + +- Bug that caused segmentation fault when compressing 3D arrays whose + dimensions are not multiples of four. Specifically, arrays of dimensions + nx * ny * nz, with ny not a multiple of four, were not handled correctly. +- Modified `examples/fields.h` to ensure standard compliance. Previously, + C99 support was needed to handle the hex float constants, which are + not supported in C++98. + +--- + +## 0.4.0 (2015-12-05) + +This version contains substantial changes to the compression algorithm that +improve PSNR by about 6 dB and speed by a factor of 2-3. These changes are +not backward compatible with previous versions of zfp. + +### Added + +- Support for 31-bit and 63-bit integer data, as well as shorter integer types. +- New examples for evaluating the throughput of the (de)compressor and for + compressing grayscale images in the pgm format. +- Frequently asked questions. + +### Changed + +- Rewrote compression codec entirely in C to make linking and calling + easier from other programming languages, and to expose the low-level + interface through C instead of C++. This necessitated significant + changes to the API as well. 
+- Minor changes to the C++ compressed array API, as well as major + implementation changes to support the C library. The namespace and + public types are now all in lower case. + +### Removed + +- Support for general fixed-point decorrelating transforms. + +--- + +## 0.3.2 (2015-12-03) + +### Fixed + +- Bug in `Array::get()` that caused the wrong cached block to be looked up, + thus occasionally copying incorrect values back to parts of the array. + +--- + +## 0.3.1 (2015-05-06) + +### Fixed + +- Rare bug caused by exponent underflow in blocks with no normal and some + subnormal numbers. + +--- + +## 0.3.0 (2015-03-03) + +This version modifies the default decorrelating transform to one that uses +only additions and bit shifts. This new transform, in addition to being +faster, also has some theoretical optimality properties and tends to improve +rate distortion. This change is not backwards compatible. + +### Added + +- Compile-time support for parameterized transforms, e.g., to support other + popular transforms like DCT, HCT, and Walsh-Hadamard. +- Floating-point traits to reduce the number of template parameters. It is + now possible to declare a 3D array as `Array3`, for example. +- Functions for setting the array scalar type and dimensions. +- `testzfp` for regression testing. + +### Changed + +- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1). + Consequently Q1.62 fixed point can be used throughout. +- Changed the order in which bits are emitted within each bit plane to be more + intelligent. Group tests are now deferred until they are needed, i.e., just + before the value bits for the group being tested. This improves the quality + of fixed-rate encodings, but has no impact on compressed size. +- Made several optimizations to improve performance. +- Consolidated several header files. + +--- + +## 0.2.1 (2014-12-12) + +### Added + +- Win64 support via Microsoft Visual Studio compiler. +- Documentation of the expected output for the diffusion example. + +### Changed + +- Made several minor changes to suppress compiler warnings. + +### Fixed + +- Broken support for IBM's `xlc` compiler. + +--- + +## 0.2.0 (2014-12-02) + +The compression interface from `zfpcompress` was relocated to a separate +library, called `libzfp`, and modified to be callable from C. This API now +uses a parameter object (`zfp_params`) to specify array type and dimensions +as well as compression parameters. + +### Added + +- Several utility functions were added to simplify `libzfp` usage: + * Functions for setting the rate, precision, and accuracy. + Corresponding functions were also added to the `Codec` class. + * A function for estimating the buffer size needed for compression. +- The `Array` class functionality was expanded: + * Support for accessing the compressed bit stream stored with an array, + e.g., for offline compressed storage and for initializing an already + compressed array. + * Functions for dynamically specifying the cache size. + * The default cache is now direct-mapped instead of two-way associative. + +### Fixed + +- Corrected the value of the lowest possible bit plane to account for both + the smallest exponent and the number of bits in the significand. +- Corrected inconsistent use of rate and precision. The rate refers to the + number of compressed bits per floating-point value, while the precision + refers to the number of uncompressed bits. The `Array` API was changed + accordingly. + +--- + +## 0.1.0 (2014-11-12) + +Initial beta release. 
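The 1.0.0 notes above rename the C++ headers and widen array sizes to `size_t`. As a minimal sketch of what code written against the new layout might look like (the `zfp/array3.hpp` header name and the `zfp::array3d` typedef are assumed by analogy with the `zfp/array.hpp` rename listed above; the rate of 8 bits/value is illustrative only):

    // sketch: a compressed 3D array under the 1.0.0 include layout
    #include <cstddef>
    #include <iostream>
    #include "zfp/array3.hpp"   // formerly array/zfparray3.h (assumed name)

    int main()
    {
      // 64^3 doubles stored at an illustrative rate of 8 bits per value
      zfp::array3d a(64, 64, 64, 8.0);

      // array dimensions and indices are now size_t rather than uint
      for (std::size_t k = 0; k < a.size_z(); k++)
        for (std::size_t j = 0; j < a.size_y(); j++)
          for (std::size_t i = 0; i < a.size_x(); i++)
            a(i, j, k) = double(i + j + k);

      // new in 1.0.0: query the storage footprint of the array object
      std::cout << a.size_bytes() << " bytes\n";
      return 0;
    }

As before, the array classes are header-only but call into the compression codec, so programs must still link against libzfp.
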
diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..6e36e7cb --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,17 @@ +cff-version: 1.1.0 +message: "If you use this software, please cite it as below." +authors: + - family-names: Lindstrom + given-names: Peter + orcid: https://orcid.org/0000-0003-3817-4199 +title: "Fixed-Rate Compressed Floating-Point Arrays" +journal: "IEEE Transactions on Visualization and Computer Graphics" +volume: 20 +number: 12 +start: 2674 +end: 2683 +year: 2014 +month: 12 +version: develop +doi: 10.1109/TVCG.2014.2346458 +date-released: 2014-11-05 diff --git a/CMakeLists.txt b/CMakeLists.txt index 47179fd8..2d0615ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,7 @@ -if(WIN32) - cmake_minimum_required(VERSION 3.4) -else() - cmake_minimum_required(VERSION 3.1) -endif() +cmake_minimum_required(VERSION 3.9) + +# Enable MACOSX_RPATH by default +cmake_policy(SET CMP0042 NEW) # Fail immediately if not using an out-of-source build if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) @@ -14,15 +13,23 @@ endif() #------------------------------------------------------------------------------# # Parse version number from zfp.h #------------------------------------------------------------------------------# -file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/zfp.h _zfp_h_contents) +file(READ ${CMAKE_CURRENT_SOURCE_DIR}/include/zfp/version.h _zfp_h_contents) string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_MAJOR[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_MAJOR ${_zfp_h_contents}) string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_MINOR[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_MINOR ${_zfp_h_contents}) string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_PATCH[ \t]+([0-9]+).*" "\\1" ZFP_VERSION_PATCH ${_zfp_h_contents}) -set(ZFP_VERSION - "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}") +string(REGEX REPLACE ".*#define[ \t]+ZFP_VERSION_TWEAK[ \t]+([0-9]+).*" + "\\1" ZFP_VERSION_TWEAK ${_zfp_h_contents}) + +if(${ZFP_VERSION_TWEAK} EQUAL 0) + set(ZFP_VERSION + "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}") +else() + set(ZFP_VERSION + "${ZFP_VERSION_MAJOR}.${ZFP_VERSION_MINOR}.${ZFP_VERSION_PATCH}.${ZFP_VERSION_TWEAK}") +endif() project(ZFP VERSION ${ZFP_VERSION}) @@ -54,7 +61,9 @@ if(MSVC) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) # Silence extraneous Visual Studio specific warnings - add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS /wd4146 /wd4305) + add_definitions(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS) + add_compile_options(/wd4146) + add_compile_options(/wd4305) endif() # Suggest C99 @@ -108,19 +117,40 @@ set(ZFP_BIT_STREAM_WORD_SIZE 64 CACHE STRING "Use smaller bit stream word type for finer rate granularity") set_property(CACHE ZFP_BIT_STREAM_WORD_SIZE PROPERTY STRINGS "8;16;32;64") +if(CMAKE_C_COMPILER_ID MATCHES "PGI|NVHPC") + # Use default alignment to address PGI compiler bug. 
+ set(ZFP_CACHE_LINE_SIZE 0 CACHE STRING "Cache line alignment in bytes") + mark_as_advanced(ZFP_CACHE_LINE_SIZE) +endif() + +set(PPM_CHROMA 2 CACHE STRING "Chroma block dimensionality for ppm example") +set_property(CACHE PPM_CHROMA PROPERTY STRINGS "1;2") + +set(ZFP_ROUNDING_MODE ZFP_ROUND_NEVER CACHE STRING + "Rounding mode for reducing bias") +set_property(CACHE ZFP_ROUNDING_MODE PROPERTY STRINGS "ZFP_ROUND_NEVER;ZFP_ROUND_FIRST;ZFP_ROUND_LAST") + +option(ZFP_WITH_DAZ "Treat subnormals as zero to avoid overflow" OFF) + option(ZFP_WITH_CUDA "Enable CUDA parallel compression" OFF) -option(ZFP_WITH_BIT_STREAM_STRIDED - "Enable strided access for progressive zfp streams" OFF) +option(ZFP_WITH_BIT_STREAM_STRIDED "Enable strided access for progressive zfp streams" OFF) +mark_as_advanced(ZFP_WITH_BIT_STREAM_STRIDED) + +option(ZFP_WITH_TIGHT_ERROR "Reduce slack in absolute errors" OFF) option(ZFP_WITH_ALIGNED_ALLOC "Enable aligned memory allocation" OFF) +mark_as_advanced(ZFP_WITH_ALIGNED_ALLOC) option(ZFP_WITH_CACHE_TWOWAY "Use two-way skew-associative cache" OFF) +mark_as_advanced(ZFP_WITH_CACHE_TWOWAY) option(ZFP_WITH_CACHE_FAST_HASH "Use a faster but more collision prone hash function" OFF) +mark_as_advanced(ZFP_WITH_CACHE_FAST_HASH) option(ZFP_WITH_CACHE_PROFILE "Count cache misses" OFF) +mark_as_advanced(ZFP_WITH_CACHE_PROFILE) # Handle compile-time macros @@ -134,19 +164,30 @@ if((DEFINED ZFP_UINT64) AND (DEFINED ZFP_UINT64_SUFFIX)) list(APPEND zfp_public_defs ZFP_UINT64_SUFFIX=${ZFP_UINT64_SUFFIX}) endif() -# This odd cmake pattern here let's the OpenMP feature be either auto-detected, +# This odd cmake pattern here lets the OpenMP feature be either auto-detected, # explicitly enabled, or explicitly disabled, instead of just on or off. if(DEFINED ZFP_WITH_OPENMP) option(ZFP_WITH_OPENMP "Enable OpenMP parallel compression" ${ZFP_WITH_OPENMP}) if(ZFP_WITH_OPENMP) - find_package(OpenMP COMPONENTS C REQUIRED) + if(BUILD_EXAMPLES) + find_package(OpenMP COMPONENTS C CXX REQUIRED) + else() + find_package(OpenMP COMPONENTS C REQUIRED) + endif() endif() else() - find_package(OpenMP COMPONENTS C) + if(BUILD_EXAMPLES) + find_package(OpenMP COMPONENTS C CXX) + else() + find_package(OpenMP COMPONENTS C) + endif() option(ZFP_WITH_OPENMP "Enable OpenMP parallel compression" ${OPENMP_FOUND}) endif() +# Suppress CMake warning about unused variable in this file +set(TOUCH_UNUSED_VARIABLE ${ZFP_OMP_TESTS_ONLY}) + # Some compilers don't use explicit libraries on the link line for OpenMP but # instead need to treat the OpenMP C flags as both compile and link flags # i.e. -fopenmp for compiling and -lgomp for linking, use -fomp for both @@ -163,7 +204,7 @@ if(ZFP_WITH_CUDA) message(FATAL_ERROR "ZFP_WITH_CUDA is enabled, but a CUDA installation was not found.") endif() if(${CUDA_VERSION_MAJOR} LESS 7) - message(FATAL_ERROR "zfp requires at least CUDA 7.0.") + message(FATAL_ERROR "zfp requires at least CUDA 7.0.") endif() endif() @@ -171,10 +212,31 @@ if(NOT (ZFP_BIT_STREAM_WORD_SIZE EQUAL 64)) list(APPEND zfp_private_defs BIT_STREAM_WORD_TYPE=uint${ZFP_BIT_STREAM_WORD_SIZE}) endif() +if(DEFINED ZFP_CACHE_LINE_SIZE) + # Add to zfp_public_defs since many tests currently include files from src. 
+# list(APPEND zfp_public_defs ZFP_CACHE_LINE_SIZE=${ZFP_CACHE_LINE_SIZE}) + list(APPEND zfp_private_defs ZFP_CACHE_LINE_SIZE=${ZFP_CACHE_LINE_SIZE}) +endif() + if(ZFP_WITH_BIT_STREAM_STRIDED) list(APPEND zfp_public_defs BIT_STREAM_STRIDED) endif() +if(NOT (ZFP_ROUNDING_MODE EQUAL ZFP_ROUND_NEVER)) + list(APPEND zfp_private_defs ZFP_ROUNDING_MODE=${ZFP_ROUNDING_MODE}) +endif() + +if(ZFP_WITH_TIGHT_ERROR) + if((ZFP_ROUNDING_MODE EQUAL 0) OR (ZFP_ROUNDING_MODE STREQUAL ZFP_ROUND_NEVER)) + message(FATAL_ERROR "ZFP_WITH_TIGHT_ERROR requires ZFP_ROUND_FIRST or ZFP_ROUND_LAST rounding mode") + endif() + list(APPEND zfp_private_defs ZFP_WITH_TIGHT_ERROR) +endif() + +if(ZFP_WITH_DAZ) + list(APPEND zfp_private_defs ZFP_WITH_DAZ) +endif() + if(ZFP_WITH_ALIGNED_ALLOC) list(APPEND zfp_compressed_array_defs ZFP_WITH_ALIGNED_ALLOC) endif() @@ -191,6 +253,8 @@ if(ZFP_WITH_CACHE_PROFILE) list(APPEND zfp_compressed_array_defs ZFP_WITH_CACHE_PROFILE) endif() +list(APPEND ppm_private_defs PPM_CHROMA=${PPM_CHROMA}) + # Link libm only if necessary include(CheckCSourceCompiles) check_c_source_compiles("#include\nfloat f; int main(){sqrt(f);return 0;}" HAVE_MATH) @@ -252,17 +316,21 @@ if(BUILD_EXAMPLES) endif() if(BUILD_TESTING) + # Disable gtest install to prevent clobbering existing installations + option(INSTALL_GMOCK "Install Googlemock" OFF) + option(INSTALL_GTEST "Install Googletest" OFF) + add_subdirectory(tests) endif() #------------------------------------------------------------------------------# # Header install #------------------------------------------------------------------------------# -install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -install(DIRECTORY array/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - if(BUILD_CFP) - install(DIRECTORY cfp/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +else() + install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + PATTERN "cfp" EXCLUDE) endif() #------------------------------------------------------------------------------# # Build type: one of None, Debug, Release, RelWithDebInfo, MinSizeRel diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..9bc8fa83 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,13 @@ +Contributing +============ + +The zfp project uses the +[Gitflow](https://nvie.com/posts/a-successful-git-branching-model/) +development model. Contributions should be made as pull requests on the +`develop` branch. Although this branch is under continuous development, +it should be robust enough to pass all regression tests. For contributions +that are not production ready, please [contact us](mailto:zfp.llnl.gov) to +have a separate branch created. The `master` branch is updated with each +release and reflects the most recent official release of zfp. See the +[Releases Page](https://github.com/LLNL/zfp/releases) for a history +of releases. 
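The CMakeLists.txt changes above add several user-facing options: `ZFP_ROUNDING_MODE`, `ZFP_WITH_TIGHT_ERROR`, `ZFP_WITH_DAZ`, and `PPM_CHROMA`, alongside the existing `ZFP_WITH_OPENMP` detection. As a hedged illustration (option values are examples only; the option names are taken verbatim from the diff), a configure step exercising them might look like:

    cd build
    cmake .. -DBUILD_EXAMPLES=ON \
             -DZFP_WITH_OPENMP=ON \
             -DZFP_ROUNDING_MODE=ZFP_ROUND_FIRST \
             -DZFP_WITH_TIGHT_ERROR=ON \
             -DZFP_WITH_DAZ=ON \
             -DPPM_CHROMA=2
    cmake --build . --config Release

Note that, per the new check above, `ZFP_WITH_TIGHT_ERROR` is rejected unless the rounding mode is `ZFP_ROUND_FIRST` or `ZFP_ROUND_LAST`.
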
diff --git a/CTestConfig.cmake b/CTestConfig.cmake index cbb9abcc..6e1eb54f 100644 --- a/CTestConfig.cmake +++ b/CTestConfig.cmake @@ -11,3 +11,6 @@ set(CTEST_DROP_METHOD "https") set(CTEST_DROP_SITE "open.cdash.org") set(CTEST_DROP_LOCATION "/submit.php?project=zfp") set(CTEST_DROP_SITE_CDASH TRUE) + +# Test Options +set(MEMORYCHECK_COMMAND_OPTIONS "--show-reachable=no") diff --git a/Config b/Config index 834bf252..cf0df65d 100644 --- a/Config +++ b/Config @@ -6,38 +6,54 @@ FC = gfortran # language standard ----------------------------------------------------------- -# CSTD = -std=c89 -Wno-unused-function +# CSTD = -std=c89 CSTD = -std=c99 CXXSTD = -std=c++98 # CXXSTD = -std=c++11 - FSTD = -std=f2003 -ffree-form -Wno-c-binding-type + FSTD = -std=f2018 -ffree-form -Wno-c-binding-type # common compiler options ----------------------------------------------------- -FLAGS = -O3 -fPIC -Wall -Wextra -pedantic -I../include +OPTFLAGS = -O3 +FLAGS = $(OPTFLAGS) -fPIC -pedantic -Wall -Wextra +LDFLAGS = SOFLAGS = -# macOS compiler options (uncomment on macOS) --------------------------------- - -# SOFLAGS += -undefined dynamic_lookup - # OpenMP compiler options ----------------------------------------------------- -# do not uncomment; use "make ZFP_WITH_OPENMP=0" to disable OpenMP +# do not comment out; use "make ZFP_WITH_OPENMP=0" to disable OpenMP OMPFLAGS = -fopenmp -# optional compiler macros ---------------------------------------------------- +# Apple clang OpenMP options +# OMPFLAGS = -Xclang -fopenmp -# use long long for 64-bit types -# DEFS += -DZFP_INT64='long long' -DZFP_INT64_SUFFIX='ll' -# DEFS += -DZFP_UINT64='unsigned long long' -DZFP_UINT64_SUFFIX='ull' +# optional compiler macros ---------------------------------------------------- -# use smaller bit stream word type for finer rate granularity +# use smaller bit stream word type for finer rate granularity; +# can bet set on command line, e.g., "make BIT_STREAM_WORD_TYPE=uint8" # DEFS += -DBIT_STREAM_WORD_TYPE=uint8 # DEFS += -DBIT_STREAM_WORD_TYPE=uint16 # DEFS += -DBIT_STREAM_WORD_TYPE=uint32 # DEFS += -DBIT_STREAM_WORD_TYPE=uint64 +# reduce bias and slack in errors; can be set on command line, e.g., +# "make ZFP_ROUNDING_MODE=ZFP_ROUND_FIRST" +# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_NEVER +# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_FIRST +# DEFS += -DZFP_ROUNDING_MODE=ZFP_ROUND_LAST +# DEFS += -DZFP_WITH_TIGHT_ERROR + +# treat subnormals as zero to avoid overflow; can be set on command line, e.g., +# "make ZFP_WITH_DAZ=1" +# DEFS += -DZFP_WITH_DAZ + +# use long long for 64-bit types +# DEFS += -DZFP_INT64='long long' -DZFP_INT64_SUFFIX='ll' +# DEFS += -DZFP_UINT64='unsigned long long' -DZFP_UINT64_SUFFIX='ull' + +# cache alignment +# DEFS += -DZFP_CACHE_LINE_SIZE=256 + # enable strided access for progressive zfp streams # DEFS += -DBIT_STREAM_STRIDED @@ -85,7 +101,24 @@ else LIBCFP = libcfp.a endif -# conditionals ---------------------------------------------------------------- +# operating system and compiler dependent flags ------------------------------- + +# macOS configuration; compile with "make OS=mac" +ifeq ($(OS),mac) + SOFLAGS += -undefined dynamic_lookup +endif + +# suppress unused function warnings when compiling C89 +ifeq ($(CSTD),-std=c89) + FLAGS += -Wno-unused-function +endif + +# process macros set on the command line -------------------------------------- + +# bit stream word type +ifdef BIT_STREAM_WORD_TYPE + DEFS += -DBIT_STREAM_WORD_TYPE=$(BIT_STREAM_WORD_TYPE) +endif # enable OpenMP? 
ifdef ZFP_WITH_OPENMP @@ -96,6 +129,31 @@ ifdef ZFP_WITH_OPENMP endif endif +# treat subnormals as zero to avoid overflow +ifdef ZFP_WITH_DAZ + ifneq ($(ZFP_WITH_DAZ),0) + FLAGS += -DZFP_WITH_DAZ + endif +endif + +# rounding mode and slack in error +ifdef ZFP_ROUNDING_MODE + FLAGS += -DZFP_ROUNDING_MODE=$(ZFP_ROUNDING_MODE) + ifneq ($(ZFP_ROUNDING_MODE),0) + # tight error bound requires round-first or round-last mode + ifdef ZFP_WITH_TIGHT_ERROR + ifneq ($(ZFP_WITH_TIGHT_ERROR),0) + FLAGS += -DZFP_WITH_TIGHT_ERROR + endif + endif + endif +endif + +# chroma mode for ppm example +ifdef PPM_CHROMA + PPM_FLAGS += -DPPM_CHROMA=$(PPM_CHROMA) +endif + # compiler options ------------------------------------------------------------ CFLAGS = $(CSTD) $(FLAGS) $(DEFS) diff --git a/LICENSE b/LICENSE index 093449a3..9bed13fb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,57 +1,29 @@ -Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -Produced at the Lawrence Livermore National Laboratory. -Written by Peter Lindstrom, Markus Salasoo, Matt Larsen, and Stephen Herbein. -LLNL-CODE-663824. -All rights reserved. +BSD 3-Clause License -This file is part of the zfp library. -For details, see http://computation.llnl.gov/casc/zfp/. +Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC +All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the disclaimer below. +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the disclaimer (as noted below) in the -documentation and/or other materials provided with the distribution. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. -3. Neither the name of the LLNS/LLNL nor the names of its contributors may -be used to endorse or promote products derived from this software without -specific prior written permission. +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Additional BSD Notice - -1. This notice is required to be provided under our contract with the U.S. -Department of Energy (DOE). 
This work was produced at Lawrence Livermore -National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. - -2. Neither the United States Government nor Lawrence Livermore National -Security, LLC nor any of their employees, makes any warranty, express or -implied, or assumes any liability or responsibility for the accuracy, -completeness, or usefulness of any information, apparatus, product, or -process disclosed, or represents that its use would not infringe -privately-owned rights. - -3. Also, reference herein to any specific commercial products, process, or -services by trade name, trademark, manufacturer or otherwise does not -necessarily constitute or imply its endorsement, recommendation, or -favoring by the United States Government or Lawrence Livermore National -Security, LLC. The views and opinions of authors expressed herein do not -necessarily state or reflect those of the United States Government or -Lawrence Livermore National Security, LLC, and shall not be used for -advertising or product endorsement purposes. +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile index bddc72ae..aacf7789 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ all: @echo $(LIBRARY) @cd src; $(MAKE) clean $(LIBRARY) ifneq ($(BUILD_CFP),0) - @cd cfp/src; $(MAKE) clean $(LIBRARY) + @cd cfp; $(MAKE) clean $(LIBRARY) endif ifneq ($(BUILD_ZFORP),0) @cd fortran; $(MAKE) clean $(LIBRARY) @@ -33,7 +33,7 @@ test: # clean all clean: @cd src; $(MAKE) clean - @cd cfp/src; $(MAKE) clean + @cd cfp; $(MAKE) clean @cd fortran; $(MAKE) clean @cd utils; $(MAKE) clean @cd tests; $(MAKE) clean diff --git a/NOTICE b/NOTICE new file mode 100644 index 00000000..3737d5a8 --- /dev/null +++ b/NOTICE @@ -0,0 +1,21 @@ +This work was produced under the auspices of the U.S. Department of +Energy by Lawrence Livermore National Laboratory under Contract +DE-AC52-07NA27344. + +This work was prepared as an account of work sponsored by an agency of +the United States Government. Neither the United States Government nor +Lawrence Livermore National Security, LLC, nor any of their employees +makes any warranty, expressed or implied, or assumes any legal liability +or responsibility for the accuracy, completeness, or usefulness of any +information, apparatus, product, or process disclosed, or represents that +its use would not infringe privately owned rights. + +Reference herein to any specific commercial product, process, or service +by trade name, trademark, manufacturer, or otherwise does not necessarily +constitute or imply its endorsement, recommendation, or favoring by the +United States Government or Lawrence Livermore National Security, LLC. + +The views and opinions of authors expressed herein do not necessarily +state or reflect those of the United States Government or Lawrence +Livermore National Security, LLC, and shall not be used for advertising +or product endorsement purposes. 
diff --git a/README.md b/README.md index 2fd5fa32..8f207f16 100644 --- a/README.md +++ b/README.md @@ -1,142 +1,114 @@ ZFP === -[![Travis CI Build Status](https://travis-ci.org/LLNL/zfp.svg?branch=develop)](https://travis-ci.org/LLNL/zfp) -[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/LLNL/zfp?branch=develop&svg=true)](https://ci.appveyor.com/project/salasoom/zfp) -[![Documentation Status](https://readthedocs.org/projects/zfp/badge/?version=release0.5.5)](https://zfp.readthedocs.io/en/release0.5.5/?badge=release0.5.5) -[![Codecov](https://codecov.io/gh/LLNL/zfp/branch/develop/graph/badge.svg)](https://codecov.io/gh/LLNL/zfp) - -INTRODUCTION ------------- - -zfp is an open source C/C++ library for compressed numerical arrays that -support high throughput read and write random access. zfp also supports -streaming compression of integer and floating-point data, e.g., for -applications that read and write large data sets to and from disk. -zfp is primarily written in C and C++ but also includes Python and -Fortran bindings. - -zfp was developed at Lawrence Livermore National Laboratory and is loosely -based on the algorithm described in the following paper: - - Peter Lindstrom - "Fixed-Rate Compressed Floating-Point Arrays" - IEEE Transactions on Visualization and Computer Graphics - 20(12):2674-2683, December 2014 - doi:10.1109/TVCG.2014.2346458 - -zfp was originally designed for floating-point arrays only, but has been -extended to also support integer data and could for instance be used to -compress images and quantized volumetric data. To achieve high compression -ratios, zfp generally uses lossy but optionally error-bounded compression. -Bit-for-bit lossless compression is also possible through one of zfp's -compression modes. - -zfp works best for 2D and 3D arrays that exhibit spatial correlation, such as -continuous fields from physics simulations, images, regularly sampled terrain -surfaces, etc. Although zfp also provides a 1D array class that can be used -for 1D signals such as audio, or even unstructured floating-point streams, -the compression scheme has not been well optimized for this use case, and -rate and quality may not be competitive with floating-point compressors -designed specifically for 1D streams. zfp also supports compression of -4D arrays. - -zfp is freely available as open source under a BSD license, as outlined in -the file 'LICENSE'. For more information on zfp and comparisons with other -compressors, please see the -[zfp website](https://computation.llnl.gov/projects/floating-point-compression). -For bug reports, please consult the -[GitHub issue tracker](https://github.com/LLNL/zfp/issues). -For questions, comments, and requests, please contact -[Peter Lindstrom](mailto:pl@llnl.gov). - - -DOCUMENTATION -------------- +[![Github Actions Build Status](https://github.com/LLNL/zfp/actions/workflows/main.yml/badge.svg?branch=release1.0.0)](https://github.com/LLNL/zfp/actions/workflows/main.yml) +[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/qb3ld7j11segy52k/branch/release1.0.0?svg=true)](https://ci.appveyor.com/project/lindstro/zfp) +[![Documentation Status](https://readthedocs.org/projects/zfp/badge/?version=release1.0.0)](https://zfp.readthedocs.io/en/release1.0.0/) -Full -[documentation](http://zfp.readthedocs.io/en/release0.5.5/) -is available online via Read the Docs. A -[PDF](http://readthedocs.org/projects/zfp/downloads/pdf/release0.5.5/) -version is also available. 
+zfp is a compressed format for representing multidimensional floating-point +and integer arrays. zfp provides compressed-array classes that support high +throughput read and write random access to individual array elements. zfp +also supports serial and parallel (OpenMP and CUDA) compression of whole +arrays, e.g., for applications that read and write large data sets to and +from disk. +zfp uses lossy but optionally error-bounded compression to achieve high +compression ratios. Bit-for-bit lossless compression is also possible +through one of zfp's compression modes. zfp works best for 2D, 3D, and 4D +arrays that exhibit spatial correlation, such as continuous fields from +physics simulations, natural images, regularly sampled terrain surfaces, etc. +zfp compression of 1D arrays is possible but generally discouraged. -INSTALLATION ------------- +zfp is freely available as open source and is distributed under a BSD license. +zfp is primarily written in C and C++ but also includes Python and Fortran +bindings. zfp conforms to various language standards, including C89, C99, +C11, C++98, C++11, and C++14, and is supported on Linux, macOS, and Windows. -zfp consists of three distinct parts: a compression library written in C; -a set of C++ header files with C wrappers that implement compressed arrays; -and a set of C and C++ examples. The main compression codec is written in -C and should conform to both the ISO C89 and C99 standards. The C++ array -classes are implemented entirely in header files and can be included as is, -but since they call the compression library, applications must link with -libzfp. -On Linux, macOS, and MinGW, zfp is easiest compiled using gcc and gmake. -CMake support is also available, e.g., for Windows builds. See below for -instructions on GNU and CMake builds. +Quick Start +----------- -zfp has successfully been built and tested using these compilers: +To download zfp, type: - gcc versions 4.4.7, 4.9.4, 5.5.0, 6.1.0, 6.4.0, 7.1.0, 7.3.0, 8.1.0 - icc versions 15.0.6, 16.0.4, 17.0.2, 18.0.2, 19.0.0 - clang versions 3.9.1, 4.0.0, 5.0.0, 6.0.0 - MinGW version 5.3.0 - Visual Studio versions 14 (2015), 15 (2017) + git clone https://github.com/LLNL/zfp.git -zfp conforms to various language standards, including C89, C99, C11, -C++98, C++11, and C++14. +zfp may be built using either [CMake](https://cmake.org/) or +[GNU make](https://www.gnu.org/software/make/). To use CMake, type: -NOTE: zfp requires 64-bit compiler and operating system support. + cd zfp + mkdir build + cd build + cmake .. + cmake --build . --config Release + ctest -## GNU builds +This builds the zfp library in the `build/lib` directory and the zfp +command-line executable in the `build/bin` directory. It then runs +the regression tests. -To build zfp using gcc, type +zfp may also be built using GNU make: + cd zfp make + make test -from this directory. This builds libzfp as a static library as well as -utilities and example programs. See documentation for complete build -instructions. +Note: GNU builds are less flexible and do not support all available features, +e.g., CUDA support. -## CMake builds +For further configuration and build instructions, please consult the +[documentation](https://zfp.readthedocs.io/en/release1.0.0/installation.html). +For examples of how to call the C library and use the C++ array classes, +see the [examples](https://zfp.readthedocs.io/en/release1.0.0/examples.html) +section. 
-To build zfp using CMake on Linux or macOS, start a Unix shell and type - mkdir build - cd build - cmake .. - make +Documentation +------------- -To also build the examples, replace the cmake line with +Full HTML [documentation](http://zfp.readthedocs.io/en/release1.0.0) is +available online. +A [PDF](http://readthedocs.org/projects/zfp/downloads/pdf/release1.0.0/) +version is also available. - cmake -DBUILD_EXAMPLES=ON .. +Further information on the zfp software is included in these files: -To build zfp using Visual Studio on Windows, start a DOS shell, cd to the -top-level zfp directory, and type +- Change log: see [CHANGELOG.md](./CHANGELOG.md). +- Support and additional resources: see [SUPPORT.md](./SUPPORT.md). +- Code contributions: see [CONTRIBUTING.md](./CONTRIBUTING.md). - mkdir build - cd build - cmake .. - cmake --build . --config Release -This builds zfp in release mode. Replace 'Release' with 'Debug' to build -zfp in debug mode. See the instructions for Linux on how to change the -cmake line to also build the example programs. +Authors +------- -## Testing +zfp was originally developed by [Peter Lindstrom](https://people.llnl.gov/pl) +at [Lawrence Livermore National Laboratory](https://www.llnl.gov/). Please +see the [Contributors Page](https://github.com/LLNL/zfp/graphs/contributors) +for a full list of contributors. -To test that zfp is working properly, type +### Citing zfp - make test +If you use zfp for scholarly research, please cite this paper: -or using CMake +* Peter Lindstrom. + [Fixed-Rate Compressed Floating-Point Arrays](https://www.researchgate.net/publication/264417607_Fixed-Rate_Compressed_Floating-Point_Arrays). + IEEE Transactions on Visualization and Computer Graphics, 20(12):2674-2683, December 2014. + [doi:10.1109/TVCG.2014.2346458](http://doi.org/10.1109/TVCG.2014.2346458). - ctest +The algorithm implemented in the current version of zfp is described in the +[documentation](https://zfp.readthedocs.io/en/latest/algorithm.html) and in +the following paper: + +* James Diffenderfer, Alyson Fox, Jeffrey Hittinger, Geoffrey Sanders, Peter Lindstrom. + [Error Analysis of ZFP Compression for Floating-Point Data](https://www.researchgate.net/publication/324908266_Error_Analysis_of_ZFP_Compression_for_Floating-Point_Data). + SIAM Journal on Scientific Computing, 41(3):A1867-A1898, June 2019. + [doi:10.1137/18M1168832](http://doi.org/10.1137/18M1168832). + + +License +------- + +zfp is distributed under the terms of the BSD 3-Clause license. See +[LICENSE](./LICENSE) and [NOTICE](./NOTICE) for details. + +SPDX-License-Identifier: BSD-3-Clause -If the compilation or regression tests fail, it is possible that some of the -macros in the file 'Config' have to be adjusted. Also, the tests may fail -due to minute differences in the computed floating-point fields being -compressed, which will be indicated by checksum errors. If most tests -succeed and the failures result in byte sizes and error values reasonably -close to the expected values, then it is likely that the compressor is -working correctly. +LLNL-CODE-663824 diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 00000000..83a97931 --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,11 @@ +Support +======= + +For more information on zfp, please see the +[zfp website](https://zfp.llnl.gov). +For bug reports and feature requests, please consult the +[GitHub issue tracker](https://github.com/LLNL/zfp/issues/). 
+For questions and comments not answered here or in the +[documentation](http://zfp.readthedocs.io), +please contact us by email at +[zfp@llnl.gov](mailto:zfp@llnl.gov). diff --git a/VERSIONS.md b/VERSIONS.md deleted file mode 100644 index 2b7e0e72..00000000 --- a/VERSIONS.md +++ /dev/null @@ -1,298 +0,0 @@ -# zfp Release Notes - -## 0.5.5 (May 5, 2019) - -- Added support for reversible (lossless) compression of floating-point and - integer data. - -- Added methods for serializing and deserializing zfp's compressed arrays. - -- Added Python bindings for compressing NumPy arrays. - -- Added Fortran bindings to zfp's high-level C API. - -- Change: - - The default compressed-array cache size is now a function of the total - number of array elements, irrespective of array shape. - -- Bug fixes: - - Incorrect handling of execution policy in zfp utility. - - Incorrect handling of decompression via header in zfp utility. - - Incorrect cleanup of device memory in CUDA decompress. - - Tests for failing mallocs. - - CMake installation of CFP when built. - - zfp\_write\_header and zfp\_field\_metadata now fail if array dimensions - are too large to fit in header. - - -## 0.5.4 (October 1, 2018) - -- Added support for CUDA fixed-rate compression and decompression. - -- Added views into compressed arrays for thread safety, nested array - indexing, slicing, and array subsetting. - -- Added C language bindings for compressed arrays. - -- Added support for compressing and decompressing 4D data. - -- Changes: - - Execution policy now applies to both compression and decompression. - - Compressed array accessors now return Scalar type instead of - const Scalar& to avoid stale references to evicted cache lines. - -- Bug fixes: - - Handling of negative strides. - - Command line tool handling of arrays with more than 2^32 elements. - - bitstream C++ compatibility. - - Respect minimum cache size request. - - -## 0.5.3 (March 28, 2018) - -- Added support for OpenMP multithreaded compression (but not decompression). - -- Added options for OpenMP execution to zfp command-line tool. - -- Changed return value of zfp\_decompress to indicate the number of compressed - bytes processed so far (now returns same value as zfp\_compress on success). - -- Added compressed array support for copy construction and assignment via - deep copies. - -- Added virtual destructors to enable inheritance from zfp arrays. - - -## 0.5.2 (September 28, 2017) - -- Added iterators and proxy objects for pointers and references. - -- Added example illustrating how to use iterators and pointers. - -- Modified diffusion example to optionally use iterators. - -- Moved internal headers under array to array/zfp. - -- Modified 64-bit integer typedefs to avoid the C89 non-compliant long long - and allow for user-supplied types and literal suffixes. - -- Renamed compile-time macros that did not have a ZFP prefix. - -- Fixed issue with setting stream word type via CMake. - -- Rewrote documentation in reStructuredText and added complete - documentation of all public functions, classes, types, and macros. - Removed ASCII documentation. - - -## 0.5.1 (March 28, 2017) - -- This release primarily fixes a few minor issues but also includes - changes in anticipation of a large number of planned future additions - to the library. No changes have been made to the compressed format, - which is backwards compatible with version 0.5.0. - -- Added high-level API support for integer types. - -- Separated library version from CODEC version and added version string. 
- -- Added example that illustrates in-place compression. - -- Added support for CMake builds. - -- Corrected inconsistent naming of BIT\_STREAM macros in code and - documentation. - -- Renamed some of the header bit mask macros. - -- Added return values to stream\_skip and stream\_flush to indicate the - number of bits skipped or output. - -- Renamed stream\_block and stream\_delta to make it clear that they refer - to strided streams. Added missing definition of stream\_stride\_block. - -- Changed int/uint types in places to use ptrdiff\_t/size\_t where - appropriate. - -- Changed API for zfp\_set\_precision and zfp\_set\_accuracy to not require - the scalar type. - -- Added missing static keyword in decode\_block. - -- Changed testzfp to allow specifying which tests to perform on the - command line. - -- Fixed bug that prevented defining uninitialized arrays. - -- Fixed incorrect computation of array sizes in zfp\_field\_size. - -- Fixed minor issues that prevented code from compiling on Windows. - -- Fixed issue with fixed-accuracy headers that caused unnecessary storage. - -- Modified directory structure. - -- Added documentation that discusses common issues with using zfp. - - -## 0.5.0 (February 29, 2016) - -- Modified CODEC to more efficiently encode blocks whose values are all - zero or are smaller in magnitude than the absolute error tolerance. - This allows representing "empty" blocks using only one bit each. This - version is not backwards compatible with prior zfp versions. - -- Changed behavior of zfp\_compress and zfp\_decompress to not automatically - rewind the bit stream. This makes it easier to concatenate multiple - compressed bit streams, e.g., when compressing vector fields or multiple - scalars together. - -- Added functions for compactly encoding the compression parameters - and field meta data, e.g., for producing self-contained compressed - streams. Also added functions for reading and writing a header - containing these parameters. - -- Changed the zfp example program interface to allow reading and writing - compressed streams, optionally with a header. The zfp tool can now be - used to compress and decompress files as a stand alone utility. - - -## 0.4.1 (December 28, 2015) - -- Fixed bug that caused segmentation fault when compressing 3D arrays - whose dimensions are not multiples of four. Specifically, arrays of - dimensions nx * ny * nz, with ny not a multiple of four, were not - handled correctly. - -- Modified examples/fields.h to ensure standard compliance. Previously, - C99 support was needed to handle the hex float constants, which are - not supported in C++98. - -- Added simple.c as a minimal example of how to call the compressor. - -- Changed compilation of diffusion example to output two executables: - one with and one without compression. - - -## 0.4.0 (December 5, 2015) - -- Substantial changes to the compression algorithm that improve PSNR - by about 6 dB and speed by a factor of 2-3. These changes are not - backward compatible with previous versions of zfp. - -- Added support for 31-bit and 63-bit integer data, as well as shorter - integer types. - -- Rewrote compression codec entirely in C to make linking and calling - easier from other programming languages, and to expose the low-level - interface through C instead of C++. This necessitated significant - changes to the API as well. - -- Minor changes to the C++ compressed array API, as well as major - implementation changes to support the C library. 
The namespace and - public types are now all in lower case. - -- Deprecated support for general fixed-point decorrelating transforms - and slimmed down implementation. - -- Added new examples for evaluating the throughput of the (de)compressor - and for compressing grayscale images in the pgm format. - -- Added FAQ. - - -## 0.3.2 (December 3, 2015) - -- Fixed bug in Array::get() that caused the wrong cached block to be - looked up, thus occasionally copying incorrect values back to parts - of the array. - - -## 0.3.1 (May 6, 2015) - -- Fixed rare bug caused by exponent underflow in blocks with no normal - and some denormal numbers. - - -## 0.3.0 (March 3, 2015) - -- Modified the default decorrelating transform to one that uses only - additions and bit shifts. This new transform, in addition to being - faster, also has some theoretical optimality properties and tends to - improve rate distortion. - -- Added compile-time support for parameterized transforms, e.g., to - support other popular transforms like DCT, HCT, and Walsh-Hadamard. - -- Made forward transform range preserving: (-1, 1) is mapped to (-1, 1). - Consequently Q1.62 fixed point can be used throughout. - -- Changed the order in which bits are emitted within each bit plane - to be more intelligent. Group tests are now deferred until they - are needed, i.e., just before the value bits for the group being - tested. This improves the quality of fixed-rate encodings, but - has no impact on compressed size. - -- Made several optimizations to improve performance. - -- Added floating-point traits to reduce the number of template - parameters. It is now possible to declare a 3D array as - Array3, for example. - -- Added functions for setting the array scalar type and dimensions. - -- Consolidated several header files. - -- Added testzfp for regression testing. - - -## 0.2.1 (December 12, 2014) - -- Added Win64 support via Microsoft Visual Studio compiler. - -- Fixed broken support for IBM's xlc compiler. - -- Made several minor changes to suppress compiler warnings. - -- Documented expected output for the diffusion example. - - -## 0.2.0 (December 2, 2014) - -- The compression interface from zfpcompress was relocated to a - separate library, called libzfp, and modified to be callable from C. - This API now uses a parameter object (zfp\_params) to specify array - type and dimensions as well as compression parameters. - -- Several utility functions were added to simplify libzfp usage: - - * Functions for setting the rate, precision, and accuracy. - Corresponding functions were also added to the Codec class. - - * A function for estimating the buffer size needed for compression. - -- The Array class functionality was expanded: - - * Support for accessing the compressed bit stream stored with an - array, e.g., for offline compressed storage and for initializing - an already compressed array. - - * Functions for dynamically specifying the cache size. - - * The default cache is now direct-mapped instead of two-way - associative. - -- Minor bug fixes: - - * Corrected the value of the lowest possible bit plane to account for - both the smallest exponent and the number of bits in the significand. - - * Corrected inconsistent use of rate and precision. The rate refers - to the number of compressed bits per floating-point value, while - the precision refers to the number of uncompressed bits. The Array - API was changed accordingly. - - -## 0.1.0 (November 12, 2014) - -- Initial beta release. 
diff --git a/appveyor.sh b/appveyor.sh index 31d7194a..94ec4e33 100644 --- a/appveyor.sh +++ b/appveyor.sh @@ -18,7 +18,23 @@ BUILD_FLAGS="$BUILD_FLAGS -DBUILD_UTILITIES=ON" BUILD_FLAGS="$BUILD_FLAGS -DBUILD_EXAMPLES=ON" BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CFP=ON" BUILD_FLAGS="$BUILD_FLAGS -DCFP_NAMESPACE=cfp2" -BUILD_FLAGS="$BUILD_FLAGS -DZFP_WITH_ALIGNED_ALLOC=ON" + +# zfpy only built for MSVC, Release builds +if [ $COMPILER == "msvc" ] && [ $BUILD_TYPE == "Release" ]; then + # verify active python version matches what was specified in appveyor.yml + + # fetch python version X.Y (single digits only) + ACTIVE_PY_VERSION=$(python -c 'import platform; print(platform.python_version())' | cut -c1-3) + # $PYTHON_VERSION comes from appveyor.yml and has form XY (no dot separating major and minor versions) + ACTIVE_PY_VERSION=${ACTIVE_PY_VERSION:0:1}${ACTIVE_PY_VERSION:2:1} + + if [ $ACTIVE_PY_VERSION != $PYTHON_VERSION ]; then + exit 1 + fi + + BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFPY=ON" +fi + BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=OFF" BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CUDA=OFF" diff --git a/appveyor.yml b/appveyor.yml index deea4b3c..abd9385a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,74 +1,67 @@ -version: 0.5.5-{build} +version: 1.0.0-{build} environment: + # zfpy only build for Release builds (otherwise need debug python libs python27_d.lib) matrix: - - COMPILER: mingw - GENERATOR: MinGW Makefiles - PLATFORM: Win32 - BUILD_TYPE: Debug - - - COMPILER: mingw - GENERATOR: MinGW Makefiles - PLATFORM: Win32 - BUILD_TYPE: Release - - - COMPILER: mingw-w64 - GENERATOR: MinGW Makefiles - PLATFORM: x64 - BUILD_TYPE: Debug - - - COMPILER: mingw-w64 - GENERATOR: MinGW Makefiles - PLATFORM: x64 - BUILD_TYPE: Release - - - COMPILER: msvc - GENERATOR: Visual Studio 15 2017 Win64 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - PLATFORM: x64 - BUILD_TYPE: Debug - - COMPILER: msvc GENERATOR: Visual Studio 15 2017 Win64 APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PLATFORM: x64 BUILD_TYPE: Release - - - COMPILER: msvc - GENERATOR: Visual Studio 15 2017 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - PLATFORM: Win32 - BUILD_TYPE: Debug + PYTHON_VERSION: 35 - COMPILER: msvc GENERATOR: Visual Studio 15 2017 APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PLATFORM: Win32 BUILD_TYPE: Release - - - COMPILER: msvc - GENERATOR: Visual Studio 14 2015 Win64 - PLATFORM: x64 - BUILD_TYPE: Debug + PYTHON_VERSION: 35 - COMPILER: msvc GENERATOR: Visual Studio 14 2015 Win64 PLATFORM: x64 BUILD_TYPE: Release + PYTHON_VERSION: 35 - COMPILER: msvc GENERATOR: Visual Studio 14 2015 PLATFORM: Win32 - BUILD_TYPE: Debug + BUILD_TYPE: Release + PYTHON_VERSION: 27 - - COMPILER: msvc - GENERATOR: Visual Studio 14 2015 + - COMPILER: mingw + GENERATOR: MinGW Makefiles PLATFORM: Win32 BUILD_TYPE: Release + - COMPILER: mingw-w64 + GENERATOR: MinGW Makefiles + PLATFORM: x64 + BUILD_TYPE: Release + install: - if "%COMPILER%"=="mingw" set PATH=C:\MinGW\bin;%PATH% - if "%COMPILER%"=="mingw-w64" set PATH=C:\MinGW\bin;%PATH% + # set env vars for Python system dir (assumed to always be MSVC) + - ps: | + if ($env:PYTHON_VERSION) { + $env:PYTHON_DIR = "C:\Python$env:PYTHON_VERSION" + if ($env:PLATFORM -eq "x64") { + $env:PYTHON_DIR = "$env:PYTHON_DIR-x64" + } + + $env:PYTHON_LIB_PATH = "$env:PYTHON_DIR\libs\python$env:PYTHON_VERSION.lib" + } + + # placing these behind a conditional for some reason prevents CMake from picking up the virtualenv + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" set 
PATH=%PYTHON_DIR%;%PYTHON_DIR%\Scripts;%PATH% + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" pip install virtualenv + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" set VIRTUALENV_NAME=pyVirtualEnv + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" virtualenv %VIRTUALENV_NAME% + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" "%VIRTUALENV_NAME%\\Scripts\\activate.bat" + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" pip install -r python\requirements.txt + - if "%COMPILER%"=="msvc" if "%BUILD_TYPE%"=="Release" python --version + build_script: - sh appveyor.sh diff --git a/array/zfp/header.h b/array/zfp/header.h deleted file mode 100644 index ad6433cf..00000000 --- a/array/zfp/header.h +++ /dev/null @@ -1,19 +0,0 @@ -class header { -public: - class exception : public std::runtime_error { - public: - exception(const std::string& msg) : runtime_error(msg) {} - - virtual ~exception() throw (){} - }; - - static void concat_sentence(std::string& s, const std::string& msg) - { - if (!s.empty()) - s += " "; - s += msg; - } - - uchar buffer[BITS_TO_BYTES(ZFP_HEADER_SIZE_BITS)]; -}; - diff --git a/array/zfp/headerHelpers.h b/array/zfp/headerHelpers.h deleted file mode 100644 index ed33816a..00000000 --- a/array/zfp/headerHelpers.h +++ /dev/null @@ -1,144 +0,0 @@ -// "Handle" classes useful when throwing exceptions - -// buffer holds aligned memory for header, suitable for bitstream r/w (word-aligned) -class AlignedBufferHandle { - public: - size_t buffer_size_bytes; - // uint64 alignment guarantees bitstream alignment - uint64* buffer; - - // can copy a header into aligned buffer - AlignedBufferHandle(const zfp::array::header* h = 0) { - size_t num_64bit_entries = DIV_ROUND_UP(ZFP_HEADER_SIZE_BITS, CHAR_BIT * sizeof(uint64)); - buffer = new uint64[num_64bit_entries]; - buffer_size_bytes = num_64bit_entries * sizeof(uint64); - - if (h) - memcpy(buffer, h->buffer, BITS_TO_BYTES(ZFP_HEADER_SIZE_BITS)); - } - - ~AlignedBufferHandle() { - delete[] buffer; - } - - void copy_to_header(zfp::array::header* h) { - memcpy(h, buffer, BITS_TO_BYTES(ZFP_HEADER_SIZE_BITS)); - } -}; - -// redirect zfp_stream->bitstream to header while object remains in scope -class DualBitstreamHandle { - public: - bitstream* old_bs; - bitstream* new_bs; - zfp_stream* zfp; - - DualBitstreamHandle(zfp_stream* zfp, AlignedBufferHandle& abh) : - zfp(zfp) - { - old_bs = zfp_stream_bit_stream(zfp); - new_bs = stream_open(abh.buffer, abh.buffer_size_bytes); - - stream_rewind(new_bs); - zfp_stream_set_bit_stream(zfp, new_bs); - } - - ~DualBitstreamHandle() { - zfp_stream_set_bit_stream(zfp, old_bs); - stream_close(new_bs); - } -}; - -class ZfpFieldHandle { - public: - zfp_field* field; - - ZfpFieldHandle() { - field = zfp_field_alloc(); - } - - ZfpFieldHandle(zfp_type type, int nx, int ny, int nz) { - field = zfp_field_3d(0, type, nx, ny, nz); - } - - ~ZfpFieldHandle() { - zfp_field_free(field); - } -}; - -class ZfpStreamHandle { - public: - bitstream* bs; - zfp_stream* stream; - - ZfpStreamHandle(AlignedBufferHandle& abh) { - bs = stream_open(abh.buffer, abh.buffer_size_bytes); - stream = zfp_stream_open(bs); - } - - ~ZfpStreamHandle() { - zfp_stream_close(stream); - stream_close(bs); - } -}; - -// verify buffer is large enough, with what header describes -static bool is_valid_buffer_size(const zfp_stream* stream, uint nx, uint ny, uint nz, size_t expected_buffer_size_bytes) -{ - uint mx = ((std::max(nx, 1u)) + 3) / 4; - uint my = ((std::max(ny, 1u)) + 3) / 4; - uint mz = ((std::max(nz, 1u)) + 3) / 
4; - size_t blocks = (size_t)mx * (size_t)my * (size_t)mz; - // no rounding because fixed-rate wra implies rate is multiple of word size - size_t described_buffer_size_bytes = blocks * stream->maxbits / CHAR_BIT; - - return expected_buffer_size_bytes >= described_buffer_size_bytes; -} - -static void read_header_contents(const zfp::array::header& header, size_t expected_buffer_size_bytes, uint& dims, zfp_type& type, double& rate, uint n[4]) -{ - // create zfp_stream and zfp_field structs to call C API zfp_read_header() - AlignedBufferHandle abh; - memcpy(abh.buffer, header.buffer, BITS_TO_BYTES(ZFP_HEADER_SIZE_BITS)); - - ZfpStreamHandle zsh(abh); - ZfpFieldHandle zfh; - - if (!zfp_read_header(zsh.stream, zfh.field, ZFP_HEADER_FULL)) - throw zfp::array::header::exception("Invalid ZFP header."); - - // gather metadata - dims = zfp_field_dimensionality(zfh.field); - type = zfp_field_type(zfh.field); - - uint num_block_entries = 1u << (2 * dims); - rate = (double)zsh.stream->maxbits / num_block_entries; - - zfp_field_size(zfh.field, n); - - // validate header - std::string err_msg = ""; - verify_header_contents(zsh.stream, zfh.field, err_msg); - - if (!err_msg.empty()) - throw zfp::array::header::exception(err_msg); - - if (expected_buffer_size_bytes && !is_valid_buffer_size(zsh.stream, zfh.field->nx, zfh.field->ny, zfh.field->nz, expected_buffer_size_bytes)) - throw zfp::array::header::exception("ZFP header expects a longer buffer than what was passed in."); -} - -// verifies metadata on zfp_stream and zfp_field describe a valid compressed array -static void verify_header_contents(const zfp_stream* stream, const zfp_field* field, std::string& err_msg) -{ - // verify read-header contents - zfp_type type = zfp_field_type(field); - if (type != zfp_type_float && type != zfp_type_double) - zfp::array::header::concat_sentence(err_msg, "ZFP compressed arrays do not yet support scalar types beyond floats and doubles."); - - uint dims = zfp_field_dimensionality(field); - if (dims < 1 || dims > 3) - zfp::array::header::concat_sentence(err_msg, "ZFP compressed arrays do not yet support dimensionalities beyond 1, 2, and 3."); - - if (zfp_stream_compression_mode(stream) != zfp_mode_fixed_rate) - zfp::array::header::concat_sentence(err_msg, "ZFP header specified a non fixed-rate mode, unsupported by this object."); -} diff --git a/array/zfp/iterator1.h b/array/zfp/iterator1.h deleted file mode 100644 index 310e8e2d..00000000 --- a/array/zfp/iterator1.h +++ /dev/null @@ -1,38 +0,0 @@ -// random access iterator that visits 1D array block by block; this class is nested within zfp::array1 -class iterator { -public: - // typedefs for STL compatibility - typedef Scalar value_type; - typedef ptrdiff_t difference_type; - typedef typename array1::reference reference; - typedef typename array1::pointer pointer; - typedef std::random_access_iterator_tag iterator_category; - - iterator() : ref(0, 0) {} - iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; return *this; } - reference operator*() const { return ref; } - reference operator[](difference_type d) const { return *operator+(d); } - iterator& operator++() { increment(); return *this; } - iterator& operator--() { decrement(); return *this; } - iterator operator++(int) { iterator it = *this; increment(); return it; } - iterator operator--(int) { iterator it = *this; decrement(); return it; } - iterator operator+=(difference_type d) { ref.i += d; return *this; } - iterator operator-=(difference_type d) { ref.i -= d; return *this; 
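// Illustrative sketch (not part of the zfp sources being removed above): the
// random access iterator of the pre-1.0 zfp::array1 carries STL-compatible
// typedefs, so standard algorithms work directly on a compressed array.
// Assumes the old header name "zfparray1.h" and double-precision data.
#include <numeric>
#include "zfparray1.h"

double sum_all(zfp::array1<double>& a)
{
  // each 4-value block is decompressed into the cache once as the iterator
  // sweeps the array; *it returns a proxy reference convertible to double
  return std::accumulate(a.begin(), a.end(), 0.0);
}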
} - iterator operator+(difference_type d) const { return iterator(ref.array, ref.i + d); } - iterator operator-(difference_type d) const { return iterator(ref.array, ref.i - d); } - difference_type operator-(const iterator& it) const { return static_cast<difference_type>(ref.i) - static_cast<difference_type>(it.ref.i); } - bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i; } - bool operator!=(const iterator& it) const { return !operator==(it); } - bool operator<=(const iterator& it) const { return ref.array == it.ref.array && ref.i <= it.ref.i; } - bool operator>=(const iterator& it) const { return ref.array == it.ref.array && ref.i >= it.ref.i; } - bool operator<(const iterator& it) const { return !operator>=(it); } - bool operator>(const iterator& it) const { return !operator<=(it); } - uint i() const { return ref.i; } - -protected: - friend class array1; - explicit iterator(array1* array, uint i) : ref(array, i) {} - void increment() { ref.i++; } - void decrement() { ref.i--; } - reference ref; -}; diff --git a/array/zfp/iterator2.h b/array/zfp/iterator2.h deleted file mode 100644 index 03052c4e..00000000 --- a/array/zfp/iterator2.h +++ /dev/null @@ -1,42 +0,0 @@ -// forward iterator that visits 2D array block by block; this class is nested within zfp::array2 -class iterator { -public: - // typedefs for STL compatibility - typedef Scalar value_type; - typedef ptrdiff_t difference_type; - typedef typename array2::reference reference; - typedef typename array2::pointer pointer; - typedef std::forward_iterator_tag iterator_category; - - iterator() : ref(0, 0, 0) {} - iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; return *this; } - reference operator*() const { return ref; } - iterator& operator++() { increment(); return *this; } - iterator operator++(int) { iterator it = *this; increment(); return it; } - bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j; } - bool operator!=(const iterator& it) const { return !operator==(it); } - uint i() const { return ref.i; } - uint j() const { return ref.j; } - -protected: - friend class array2; - explicit iterator(array2* array, uint i, uint j) : ref(array, i, j) {} - void increment() - { - ref.i++; - if (!(ref.i & 3u) || ref.i == ref.array->nx) { - ref.i = (ref.i - 1) & ~3u; - ref.j++; - if (!(ref.j & 3u) || ref.j == ref.array->ny) { - ref.j = (ref.j - 1) & ~3u; - // done with block; advance to next - if ((ref.i += 4) >= ref.array->nx) { - ref.i = 0; - if ((ref.j += 4) >= ref.array->ny) - ref.j = ref.array->ny; - } - } - } - } - reference ref; -}; diff --git a/array/zfp/iterator3.h b/array/zfp/iterator3.h deleted file mode 100644 index 3889fc1c..00000000 --- a/array/zfp/iterator3.h +++ /dev/null @@ -1,50 +0,0 @@ -// forward iterator that visits 3D array block by block; this class is nested within zfp::array3 -class iterator { -public: - // typedefs for STL compatibility - typedef Scalar value_type; - typedef ptrdiff_t difference_type; - typedef typename array3::reference reference; - typedef typename array3::pointer pointer; - typedef std::forward_iterator_tag iterator_category; - - iterator() : ref(0, 0, 0, 0) {} - iterator operator=(const iterator& it) { ref.array = it.ref.array; ref.i = it.ref.i; ref.j = it.ref.j; ref.k = it.ref.k; return *this; } - reference operator*() const { return ref; } - iterator& operator++() { increment(); return *this; } - iterator operator++(int) { iterator it = *this; increment(); 
return it; } - bool operator==(const iterator& it) const { return ref.array == it.ref.array && ref.i == it.ref.i && ref.j == it.ref.j && ref.k == it.ref.k; } - bool operator!=(const iterator& it) const { return !operator==(it); } - uint i() const { return ref.i; } - uint j() const { return ref.j; } - uint k() const { return ref.k; } - -protected: - friend class array3; - explicit iterator(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {} - void increment() - { - ref.i++; - if (!(ref.i & 3u) || ref.i == ref.array->nx) { - ref.i = (ref.i - 1) & ~3u; - ref.j++; - if (!(ref.j & 3u) || ref.j == ref.array->ny) { - ref.j = (ref.j - 1) & ~3u; - ref.k++; - if (!(ref.k & 3u) || ref.k == ref.array->nz) { - ref.k = (ref.k - 1) & ~3u; - // done with block; advance to next - if ((ref.i += 4) >= ref.array->nx) { - ref.i = 0; - if ((ref.j += 4) >= ref.array->ny) { - ref.j = 0; - if ((ref.k += 4) >= ref.array->nz) - ref.k = ref.array->nz; - } - } - } - } - } - } - reference ref; -}; diff --git a/array/zfp/memory.h b/array/zfp/memory.h deleted file mode 100644 index 852559da..00000000 --- a/array/zfp/memory.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef ZFP_MEMORY_H -#define ZFP_MEMORY_H - -#ifdef _WIN32 -extern "C" { - #ifdef __MINGW32__ - #include - #endif - - #include -} -#endif - -#include -#include -#include "zfp/types.h" - -#define unused_(x) ((void)(x)) - -namespace zfp { - -// allocate size bytes -inline void* -allocate(size_t size) -{ - return new uchar[size]; -} - -// allocate size bytes with alignment -inline void* -allocate_aligned(size_t size, size_t alignment) -{ - void* ptr; - bool is_mem_failed = false; - -#ifdef ZFP_WITH_ALIGNED_ALLOC - #ifdef __INTEL_COMPILER - ptr = _mm_malloc(size, alignment); - - #elif defined(__MINGW32__) - ptr = __mingw_aligned_malloc(size, alignment); - - #elif defined(_WIN32) - ptr = _aligned_malloc(size, alignment); - - #elif (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600) || defined(__MACH__) - is_mem_failed = posix_memalign(&ptr, alignment, size); - - #else - unused_(alignment); - ptr = malloc(size); - - #endif - -#else - unused_(alignment); - ptr = malloc(size); - -#endif - - if (is_mem_failed || (ptr == NULL)) - throw std::bad_alloc(); - - return ptr; -} - -// deallocate memory pointed to by ptr -template -inline void -deallocate(T* ptr) -{ - delete[] ptr; -} - -template -inline void -deallocate_aligned(T* ptr) -{ -#ifdef ZFP_WITH_ALIGNED_ALLOC - if (ptr) - #ifdef __INTEL_COMPILER - _mm_free(ptr); - #elif defined(__MINGW32__) - __mingw_aligned_free(ptr); - #elif defined(_WIN32) - _aligned_free(ptr); - #else - free(ptr); - #endif - -#else - if (ptr) - free(ptr); -#endif -} - -// reallocate size bytes -template -inline void -reallocate(T*& ptr, size_t size) -{ - zfp::deallocate(ptr); - ptr = static_cast(zfp::allocate(size)); -} - -template -inline void -reallocate_aligned(T*& ptr, size_t size, size_t alignment) -{ - zfp::deallocate_aligned(ptr); - ptr = static_cast(zfp::allocate_aligned(size, alignment)); -} - -// clone array 'T src[count]' -template -inline void -clone(T*& dst, const T* src, size_t count) -{ - zfp::deallocate(dst); - if (src) { - dst = static_cast(zfp::allocate(count * sizeof(T))); - std::copy(src, src + count, dst); - } - else - dst = 0; -} - -template -inline void -clone_aligned(T*& dst, const T* src, size_t count, size_t alignment) -{ - zfp::deallocate_aligned(dst); - if (src) { - dst = static_cast(zfp::allocate_aligned(count * sizeof(T), alignment)); - std::copy(src, src + count, dst); - } - else - dst = 0; -} - -} - -#undef 
unused_ - -#endif diff --git a/array/zfp/pointer1.h b/array/zfp/pointer1.h deleted file mode 100644 index f58557c0..00000000 --- a/array/zfp/pointer1.h +++ /dev/null @@ -1,30 +0,0 @@ -// pointer to a 1D array element; this class is nested within zfp::array1 -class pointer { -public: - pointer() : ref(0, 0) {} - pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; return *this; } - reference operator*() const { return ref; } - reference operator[](ptrdiff_t d) const { return *operator+(d); } - pointer& operator++() { increment(); return *this; } - pointer& operator--() { decrement(); return *this; } - pointer operator++(int) { pointer p = *this; increment(); return p; } - pointer operator--(int) { pointer p = *this; decrement(); return p; } - pointer operator+=(ptrdiff_t d) { ref.i += d; return *this; } - pointer operator-=(ptrdiff_t d) { ref.i -= d; return *this; } - pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } - pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } - ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } - bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i; } - bool operator!=(const pointer& p) const { return !operator==(p); } - -protected: - friend class array1; - friend class reference; - explicit pointer(reference r) : ref(r) {} - explicit pointer(array1* array, uint i) : ref(array, i) {} - ptrdiff_t index() const { return ref.i; } - void set(ptrdiff_t index) { ref.i = index; } - void increment() { ref.i++; } - void decrement() { ref.i--; } - reference ref; -}; diff --git a/array/zfp/pointer2.h b/array/zfp/pointer2.h deleted file mode 100644 index dcdb518f..00000000 --- a/array/zfp/pointer2.h +++ /dev/null @@ -1,42 +0,0 @@ -// pointer to a 2D array element; this class is nested within zfp::array2 -class pointer { -public: - pointer() : ref(0, 0, 0) {} - pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; return *this; } - reference operator*() const { return ref; } - reference operator[](ptrdiff_t d) const { return *operator+(d); } - pointer& operator++() { increment(); return *this; } - pointer& operator--() { decrement(); return *this; } - pointer operator++(int) { pointer p = *this; increment(); return p; } - pointer operator--(int) { pointer p = *this; decrement(); return p; } - pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } - pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } - pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } - pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } - ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } - bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j; } - bool operator!=(const pointer& p) const { return !operator==(p); } - -protected: - friend class array2; - friend class reference; - explicit pointer(reference r) : ref(r) {} - explicit pointer(array2* array, uint i, uint j) : ref(array, i, j) {} - ptrdiff_t index() const { return ref.i + ref.array->nx * ref.j; } - void set(ptrdiff_t index) { ref.array->ij(ref.i, ref.j, index); } - void increment() - { - if (++ref.i == ref.array->nx) { - ref.i = 0; - ref.j++; - } - } - void decrement() - { - if (!ref.i--) { - ref.i = ref.array->nx - 1; - ref.j--; - } - } - reference ref; -}; diff --git a/array/zfp/pointer3.h 
b/array/zfp/pointer3.h deleted file mode 100644 index 091af604..00000000 --- a/array/zfp/pointer3.h +++ /dev/null @@ -1,48 +0,0 @@ -// pointer to a 3D array element; this class is nested within zfp::array3 -class pointer { -public: - pointer() : ref(0, 0, 0, 0) {} - pointer operator=(const pointer& p) { ref.array = p.ref.array; ref.i = p.ref.i; ref.j = p.ref.j; ref.k = p.ref.k; return *this; } - reference operator*() const { return ref; } - reference operator[](ptrdiff_t d) const { return *operator+(d); } - pointer& operator++() { increment(); return *this; } - pointer& operator--() { decrement(); return *this; } - pointer operator++(int) { pointer p = *this; increment(); return p; } - pointer operator--(int) { pointer p = *this; decrement(); return p; } - pointer operator+=(ptrdiff_t d) { set(index() + d); return *this; } - pointer operator-=(ptrdiff_t d) { set(index() - d); return *this; } - pointer operator+(ptrdiff_t d) const { pointer p = *this; p += d; return p; } - pointer operator-(ptrdiff_t d) const { pointer p = *this; p -= d; return p; } - ptrdiff_t operator-(const pointer& p) const { return index() - p.index(); } - bool operator==(const pointer& p) const { return ref.array == p.ref.array && ref.i == p.ref.i && ref.j == p.ref.j && ref.k == p.ref.k; } - bool operator!=(const pointer& p) const { return !operator==(p); } - -protected: - friend class array3; - friend class reference; - explicit pointer(reference r) : ref(r) {} - explicit pointer(array3* array, uint i, uint j, uint k) : ref(array, i, j, k) {} - ptrdiff_t index() const { return ref.i + ref.array->nx * (ref.j + ref.array->ny * ref.k); } - void set(ptrdiff_t index) { ref.array->ijk(ref.i, ref.j, ref.k, index); } - void increment() - { - if (++ref.i == ref.array->nx) { - ref.i = 0; - if (++ref.j == ref.array->ny) { - ref.j = 0; - ref.k++; - } - } - } - void decrement() - { - if (!ref.i--) { - ref.i = ref.array->nx - 1; - if (!ref.j--) { - ref.j = ref.array->ny - 1; - ref.k--; - } - } - } - reference ref; -}; diff --git a/array/zfp/reference1.h b/array/zfp/reference1.h deleted file mode 100644 index 99f2e6a6..00000000 --- a/array/zfp/reference1.h +++ /dev/null @@ -1,27 +0,0 @@ -// reference to a 1D array element; this class is nested within zfp::array1 -class reference { -public: - operator Scalar() const { return array->get(i); } - reference operator=(const reference& r) { array->set(i, r.operator Scalar()); return *this; } - reference operator=(Scalar val) { array->set(i, val); return *this; } - reference operator+=(Scalar val) { array->add(i, val); return *this; } - reference operator-=(Scalar val) { array->sub(i, val); return *this; } - reference operator*=(Scalar val) { array->mul(i, val); return *this; } - reference operator/=(Scalar val) { array->div(i, val); return *this; } - pointer operator&() const { return pointer(*this); } - // swap two array elements via proxy references - friend void swap(reference a, reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - -protected: - friend class array1; - friend class iterator; - explicit reference(array1* array, uint i) : array(array), i(i) {} - array1* array; - uint i; -}; diff --git a/array/zfp/reference2.h b/array/zfp/reference2.h deleted file mode 100644 index 76a0bd3b..00000000 --- a/array/zfp/reference2.h +++ /dev/null @@ -1,27 +0,0 @@ -// reference to a 2D array element; this class is nested within zfp::array2 -class reference { -public: - operator Scalar() const { return array->get(i, j); } 
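// Illustrative sketch (not part of the zfp sources being removed above):
// element access in the pre-1.0 arrays goes through these proxy reference and
// pointer classes; operator&() on a reference yields a proxy pointer, and the
// compound assignments update the compressed data in place. Assumes the old
// header name "zfparray2.h".
#include "zfparray2.h"

void proxy_example(zfp::array2<float>& a)
{
  a(0, 0) = 1.0f;                            // reference::operator= calls array2::set()
  a(1, 0) += 2.0f;                           // reference::operator+= decompresses, updates, caches
  swap(a(0, 0), a(1, 0));                    // friend swap() defined on proxy references
  zfp::array2<float>::pointer p = &a(0, 0);  // proxy pointer from reference::operator&
  *p = 3.0f;                                 // dereferencing yields a proxy reference again
}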
- reference operator=(const reference& r) { array->set(i, j, r.operator Scalar()); return *this; } - reference operator=(Scalar val) { array->set(i, j, val); return *this; } - reference operator+=(Scalar val) { array->add(i, j, val); return *this; } - reference operator-=(Scalar val) { array->sub(i, j, val); return *this; } - reference operator*=(Scalar val) { array->mul(i, j, val); return *this; } - reference operator/=(Scalar val) { array->div(i, j, val); return *this; } - pointer operator&() const { return pointer(*this); } - // swap two array elements via proxy references - friend void swap(reference a, reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - -protected: - friend class array2; - friend class iterator; - explicit reference(array2* array, uint i, uint j) : array(array), i(i), j(j) {} - array2* array; - uint i, j; -}; diff --git a/array/zfp/reference3.h b/array/zfp/reference3.h deleted file mode 100644 index 91175e18..00000000 --- a/array/zfp/reference3.h +++ /dev/null @@ -1,27 +0,0 @@ -// reference to a 3D array element; this class is nested within zfp::array3 -class reference { -public: - operator Scalar() const { return array->get(i, j, k); } - reference operator=(const reference& r) { array->set(i, j, k, r.operator Scalar()); return *this; } - reference operator=(Scalar val) { array->set(i, j, k, val); return *this; } - reference operator+=(Scalar val) { array->add(i, j, k, val); return *this; } - reference operator-=(Scalar val) { array->sub(i, j, k, val); return *this; } - reference operator*=(Scalar val) { array->mul(i, j, k, val); return *this; } - reference operator/=(Scalar val) { array->div(i, j, k, val); return *this; } - pointer operator&() const { return pointer(*this); } - // swap two array elements via proxy references - friend void swap(reference a, reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - -protected: - friend class array3; - friend class iterator; - explicit reference(array3* array, uint i, uint j, uint k) : array(array), i(i), j(j), k(k) {} - array3* array; - uint i, j, k; -}; diff --git a/array/zfp/view1.h b/array/zfp/view1.h deleted file mode 100644 index 6129ae5e..00000000 --- a/array/zfp/view1.h +++ /dev/null @@ -1,291 +0,0 @@ -// 1D array views; these classes are nested within zfp::array1 - -// abstract view of 1D array (base class) -class preview { -public: - // rate in bits per value - double rate() const { return array->rate(); } - - // dimensions of (sub)array - size_t size() const { return size_t(nx); } - - // local to global array index - uint global_x(uint i) const { return x + i; } - -protected: - // construction and assignment--perform shallow copy of (sub)array - explicit preview(array1* array) : array(array), x(0), nx(array->nx) {} - explicit preview(array1* array, uint x, uint nx) : array(array), x(x), nx(nx) {} - preview& operator=(array1* a) - { - array = a; - x = 0; - nx = a->nx; - return *this; - } - - array1* array; // underlying container - uint x; // offset into array - uint nx; // dimensions of subarray -}; - -// generic read-only view into a rectangular subset of a 1D array -class const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::nx; -public: - // construction--perform shallow copy of (sub)array - const_view(array1* array) : preview(array) {} - const_view(array1* array, uint x, uint nx) : preview(array, x, nx) {} - - // 
dimensions of (sub)array - uint size_x() const { return nx; } - - // [i] accessor - Scalar operator[](uint index) const { return array->get(x + index); } - - // (i) accessor - Scalar operator()(uint i) const { return array->get(x + i); } -}; - -// generic read-write view into a rectangular subset of a 1D array -class view : public const_view { -protected: - using preview::array; - using preview::x; - using preview::nx; -public: - // construction--perform shallow copy of (sub)array - view(array1* array) : const_view(array) {} - view(array1* array, uint x, uint nx) : const_view(array, x, nx) {} - - // [i] accessor from base class - using const_view::operator[]; - - // (i) accessor from base class - using const_view::operator(); - - // [i] mutator - reference operator[](uint index) { return reference(array, x + index); } - - // (i) mutator - reference operator()(uint i) { return reference(array, x + i); } -}; - -// thread-safe read-only view of 1D (sub)array with private cache -class private_const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::nx; -public: - // construction--perform shallow copy of (sub)array - private_const_view(array1* array) : - preview(array), - cache(array->cache.size()) - { - init(); - } - private_const_view(array1* array, uint x, uint nx) : - preview(array, x, nx), - cache(array->cache.size()) - { - init(); - } - - // destructor - ~private_const_view() - { - stream_close(zfp->stream); - zfp_stream_close(zfp); - } - - // dimensions of (sub)array - uint size_x() const { return nx; } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - cache.resize(array->lines(csize, nx)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // (i) accessor - Scalar operator()(uint i) const { return get(x + i); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - const Scalar& operator()(uint i) const { return a[index(i)]; } - Scalar& operator()(uint i) { return a[index(i)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - protected: - static uint index(uint i) { return i & 3u; } - Scalar a[4]; - }; - - // copy private data - void init() - { - // copy compressed stream - zfp = zfp_stream_open(0); - *zfp = *array->zfp; - // copy bit stream - zfp->stream = stream_clone(array->zfp->stream); - } - - // inspector - const Scalar& get(uint i) const - { - const CacheLine* p = line(i); - return (*p)(i); - } - - // return cache line for i; may require write-back and fetch - CacheLine* line(uint i) const - { - CacheLine* p = 0; - uint b = array->block(i); - typename Cache::Tag t = cache.access(p, b + 1, false); - uint c = t.index() - 1; - // fetch cache line; no writeback possible since view is read-only - if (c != b) - decode(b, p->data()); - return p; - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * array->blkbits); - Codec::decode_block_1(zfp, block, array->shape ? 
array->shape[index] : 0); - } - - zfp_stream* zfp; // stream of compressed blocks - mutable Cache cache; // cache of decompressed blocks -}; - -// thread-safe read-write view of private 1D (sub)array -class private_view : public private_const_view { -protected: - using preview::array; - using preview::x; - using preview::nx; - using private_const_view::zfp; - using private_const_view::cache; - using private_const_view::init; - using private_const_view::decode; - class view_reference; - typedef typename private_const_view::CacheLine CacheLine; -public: - // construction--perform shallow copy of (sub)array - private_view(array1* array) : private_const_view(array) {} - private_view(array1* array, uint x, uint nx) : private_const_view(array, x, nx) {} - - // partition view into count block-aligned pieces, with 0 <= index < count - void partition(uint index, uint count) - { - partition(x, nx, index, count); - } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // (i) accessor from base class - using private_const_view::operator(); - - // (i) mutator - view_reference operator()(uint i) { return view_reference(this, x + i); } - -protected: - class view_reference { - public: - operator Scalar() const { return view->get(i); } - view_reference operator=(const view_reference& r) { view->set(i, r.operator Scalar()); return *this; } - view_reference operator=(Scalar val) { view->set(i, val); return *this; } - view_reference operator+=(Scalar val) { view->add(i, val); return *this; } - view_reference operator-=(Scalar val) { view->sub(i, val); return *this; } - view_reference operator*=(Scalar val) { view->mul(i, val); return *this; } - view_reference operator/=(Scalar val) { view->div(i, val); return *this; } - // swap two array elements via proxy references - friend void swap(view_reference a, view_reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - - protected: - friend class private_view; - explicit view_reference(private_view* view, uint i) : view(view), i(i) {} - private_view* view; - uint i; - }; - - // block-aligned partition of [offset, offset + size): index out of count - static void partition(uint& offset, uint& size, uint index, uint count) - { - uint bmin = offset / 4; - uint bmax = (offset + size + 3) / 4; - uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); - uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); - offset = xmin; - size = xmax - xmin; - } - - // mutator - void set(uint i, Scalar val) - { - CacheLine* p = line(i, true); - (*p)(i) = val; - } - - // in-place updates - void add(uint i, Scalar val) { (*line(i, true))(i) += val; } - void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; } - void mul(uint i, Scalar val) { (*line(i, true))(i) *= val; } - void div(uint i, Scalar val) { (*line(i, true))(i) /= val; } - - // return cache line for i; may require write-back and fetch - CacheLine* line(uint i, bool write) const - { - CacheLine* p = 0; - uint b = array->block(i); - typename Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - decode(b, p->data()); - } - return p; - } - - // 
encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * array->blkbits); - Codec::encode_block_1(zfp, block, array->shape ? array->shape[index] : 0); - stream_flush(zfp->stream); - } -}; diff --git a/array/zfp/view2.h b/array/zfp/view2.h deleted file mode 100644 index fcfdf8ca..00000000 --- a/array/zfp/view2.h +++ /dev/null @@ -1,393 +0,0 @@ -// 2D array views; these classes are nested within zfp::array2 - -// abstract view of 2D array (base class) -class preview { -public: - // rate in bits per value - double rate() const { return array->rate(); } - - // dimensions of (sub)array - size_t size() const { return size_t(nx) * size_t(ny); } - - // local to global array indices - uint global_x(uint i) const { return x + i; } - uint global_y(uint j) const { return y + j; } - -protected: - // construction and assignment--perform shallow copy of (sub)array - explicit preview(array2* array) : array(array), x(0), y(0), nx(array->nx), ny(array->ny) {} - explicit preview(array2* array, uint x, uint y, uint nx, uint ny) : array(array), x(x), y(y), nx(nx), ny(ny) {} - preview& operator=(array2* a) - { - array = a; - x = y = 0; - nx = a->nx; - ny = a->ny; - return *this; - } - - array2* array; // underlying container - uint x, y; // offset into array - uint nx, ny; // dimensions of subarray -}; - -// generic read-only view into a rectangular subset of a 2D array -class const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // construction--perform shallow copy of (sub)array - const_view(array2* array) : preview(array) {} - const_view(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - - // (i, j) accessor - Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); } -}; - -// generic read-write view into a rectangular subset of a 2D array -class view : public const_view { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // construction--perform shallow copy of (sub)array - view(array2* array) : const_view(array) {} - view(array2* array, uint x, uint y, uint nx, uint ny) : const_view(array, x, y, nx, ny) {} - - // (i, j) accessor from base class - using const_view::operator(); - - // (i, j) mutator - reference operator()(uint i, uint j) { return reference(array, x + i, y + j); } -}; - -// flat view of 2D array (operator[] returns scalar) -class flat_view : public view { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // construction--perform shallow copy of (sub)array - flat_view(array2* array) : view(array) {} - flat_view(array2* array, uint x, uint y, uint nx, uint ny) : view(array, x, y, nx, ny) {} - - // convert (i, j) index to flat index - uint index(uint i, uint j) const { return i + nx * j; } - - // convert flat index to (i, j) index - void ij(uint& i, uint& j, uint index) const - { - i = index % nx; index /= nx; - j = index; - } - - // flat index accessors - Scalar operator[](uint index) const - { - uint i, j; - ij(i, j, index); - return array->get(x + i, y + j); - } - reference operator[](uint index) - { - uint i, j; - ij(i, j, index); - return reference(array, x + i, y + j); - } -}; - -// forward declaration of 
friends -class nested_view1; -class nested_view2; - -// nested view into a 1D rectangular subset of a 2D array -class nested_view1 : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // dimensions of (sub)array - uint size_x() const { return nx; } - - // [i] accessor and mutator - Scalar operator[](uint index) const { return array->get(x + index, y); } - reference operator[](uint index) { return reference(array, x + index, y); } - - // (i) accessor and mutator - Scalar operator()(uint i) const { return array->get(x + i, y); } - reference operator()(uint i) { return reference(array, x + i, y); } - -protected: - // construction--perform shallow copy of (sub)array - friend class nested_view2; - explicit nested_view1(array2* array) : preview(array) {} - explicit nested_view1(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} -}; - -// nested view into a 2D rectangular subset of a 2D array -class nested_view2 : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // construction--perform shallow copy of (sub)array - nested_view2(array2* array) : preview(array) {} - nested_view2(array2* array, uint x, uint y, uint nx, uint ny) : preview(array, x, y, nx, ny) {} - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - - // 1D view - nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, nx, 1); } - - // (i, j) accessor and mutator - Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j); } - reference operator()(uint i, uint j) { return reference(array, x + i, y + j); } -}; - -typedef nested_view2 nested_view; - -// thread-safe read-only view of 2D (sub)array with private cache -class private_const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; -public: - // construction--perform shallow copy of (sub)array - private_const_view(array2* array) : - preview(array), - cache(array->cache.size()) - { - init(); - } - private_const_view(array2* array, uint x, uint y, uint nx, uint ny) : - preview(array, x, y, nx, ny), - cache(array->cache.size()) - { - init(); - } - - // destructor - ~private_const_view() - { - stream_close(zfp->stream); - zfp_stream_close(zfp); - } - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - cache.resize(array->lines(csize, nx, ny)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // (i, j) accessor - Scalar operator()(uint i, uint j) const { return get(x + i, y + j); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - const Scalar& operator()(uint i, uint j) const { return a[index(i, j)]; } - Scalar& operator()(uint i, uint j) { return a[index(i, j)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - protected: - static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); } - Scalar a[16]; - }; - - // copy private data - void init() - { - 
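// Illustrative sketch (not part of the zfp sources being removed above): the
// private views defined here give each thread its own block cache and cloned
// bit stream, which is what makes concurrent reads safe. Assumes the old
// header name "zfparray2.h" and OpenMP; flushing the array's own cache before
// the parallel region is assumed necessary so the compressed stream is up to
// date when each view clones it.
#include "zfparray2.h"

double parallel_sum(zfp::array2<double>& a)
{
  a.flush_cache();  // write back any blocks modified through the array itself
  double sum = 0;
  #pragma omp parallel reduction(+ : sum)
  {
    zfp::array2<double>::private_const_view v(&a);  // per-thread view with private cache
    for (unsigned int j = 0; j < v.size_y(); j++)
      for (unsigned int i = 0; i < v.size_x(); i++)
        sum += v(i, j);
  }
  return sum;
}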
// copy compressed stream - zfp = zfp_stream_open(0); - *zfp = *array->zfp; - // copy bit stream - zfp->stream = stream_clone(array->zfp->stream); - } - - // inspector - const Scalar& get(uint i, uint j) const - { - const CacheLine* p = line(i, j); - return (*p)(i, j); - } - - // return cache line for (i, j); may require write-back and fetch - CacheLine* line(uint i, uint j) const - { - CacheLine* p = 0; - uint b = array->block(i, j); - typename Cache::Tag t = cache.access(p, b + 1, false); - uint c = t.index() - 1; - // fetch cache line; no writeback possible since view is read-only - if (c != b) - decode(b, p->data()); - return p; - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * array->blkbits); - Codec::decode_block_2(zfp, block, array->shape ? array->shape[index] : 0); - } - - zfp_stream* zfp; // stream of compressed blocks - mutable Cache cache; // cache of decompressed blocks -}; - -// thread-safe read-write view of private 2D (sub)array -class private_view : public private_const_view { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::nx; - using preview::ny; - using private_const_view::zfp; - using private_const_view::cache; - using private_const_view::init; - using private_const_view::decode; - class view_reference; - typedef typename private_const_view::CacheLine CacheLine; -public: - // construction--perform shallow copy of (sub)array - private_view(array2* array) : private_const_view(array) {} - private_view(array2* array, uint x, uint y, uint nx, uint ny) : private_const_view(array, x, y, nx, ny) {} - - // partition view into count block-aligned pieces, with 0 <= index < count - void partition(uint index, uint count) - { - if (nx > ny) - partition(x, nx, index, count); - else - partition(y, ny, index, count); - } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // (i, j) accessor from base class - using private_const_view::operator(); - - // (i, j) mutator - view_reference operator()(uint i, uint j) { return view_reference(this, x + i, y + j); } - -protected: - class view_reference { - public: - operator Scalar() const { return view->get(i, j); } - view_reference operator=(const view_reference& r) { view->set(i, j, r.operator Scalar()); return *this; } - view_reference operator=(Scalar val) { view->set(i, j, val); return *this; } - view_reference operator+=(Scalar val) { view->add(i, j, val); return *this; } - view_reference operator-=(Scalar val) { view->sub(i, j, val); return *this; } - view_reference operator*=(Scalar val) { view->mul(i, j, val); return *this; } - view_reference operator/=(Scalar val) { view->div(i, j, val); return *this; } - // swap two array elements via proxy references - friend void swap(view_reference a, view_reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - - protected: - friend class private_view; - explicit view_reference(private_view* view, uint i, uint j) : view(view), i(i), j(j) {} - private_view* view; - uint i, j; - }; - - // block-aligned partition of [offset, offset + size): index out of count - static void partition(uint& offset, uint& size, uint index, uint count) - { - uint bmin = offset / 4; - uint bmax = (offset 
+ size + 3) / 4; - uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); - uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); - offset = xmin; - size = xmax - xmin; - } - - // mutator - void set(uint i, uint j, Scalar val) - { - CacheLine* p = line(i, j, true); - (*p)(i, j) = val; - } - - // in-place updates - void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; } - void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; } - void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; } - void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; } - - // return cache line for (i, j); may require write-back and fetch - CacheLine* line(uint i, uint j, bool write) const - { - CacheLine* p = 0; - uint b = array->block(i, j); - typename Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - decode(b, p->data()); - } - return p; - } - - // encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * array->blkbits); - Codec::encode_block_2(zfp, block, array->shape ? array->shape[index] : 0); - stream_flush(zfp->stream); - } -}; diff --git a/array/zfp/view3.h b/array/zfp/view3.h deleted file mode 100644 index b1bf457f..00000000 --- a/array/zfp/view3.h +++ /dev/null @@ -1,445 +0,0 @@ -// 3D array views; these classes are nested within zfp::array3 - -// abstract view of 3D array (base class) -class preview { -public: - // rate in bits per value - double rate() const { return array->rate(); } - - // dimensions of (sub)array - size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); } - - // local to global array indices - uint global_x(uint i) const { return x + i; } - uint global_y(uint j) const { return y + j; } - uint global_z(uint k) const { return z + k; } - -protected: - // construction and assignment--perform shallow copy of (sub)array - explicit preview(array3* array) : array(array), x(0), y(0), z(0), nx(array->nx), ny(array->ny), nz(array->nz) {} - explicit preview(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : array(array), x(x), y(y), z(z), nx(nx), ny(ny), nz(nz) {} - preview& operator=(array3* a) - { - array = a; - x = y = z = 0; - nx = a->nx; - ny = a->ny; - nz = a->nz; - return *this; - } - - array3* array; // underlying container - uint x, y, z; // offset into array - uint nx, ny, nz; // dimensions of subarray -}; - -// generic read-only view into a rectangular subset of a 3D array -class const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // construction--perform shallow copy of (sub)array - const_view(array3* array) : preview(array) {} - const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - uint size_z() const { return nz; } - - // (i, j, k) accessor - Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); } -}; - -// generic read-write view into a rectangular subset of a 3D array -class view : public const_view { -protected: - using preview::array; - using preview::x; - using preview::y; - 
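// Illustrative sketch (not part of the zfp sources being removed above): a
// const_view selects a rectangular subvolume by offset and extent and exposes
// read-only (i, j, k) access in view-local coordinates. Assumes the old
// header name "zfparray3.h".
#include "zfparray3.h"

double subvolume_mean(zfp::array3<double>& vol,
                      unsigned int x, unsigned int y, unsigned int z,
                      unsigned int nx, unsigned int ny, unsigned int nz)
{
  zfp::array3<double>::const_view v(&vol, x, y, z, nx, ny, nz);
  double sum = 0;
  for (unsigned int k = 0; k < v.size_z(); k++)
    for (unsigned int j = 0; j < v.size_y(); j++)
      for (unsigned int i = 0; i < v.size_x(); i++)
        sum += v(i, j, k);   // indices are local to the view, not the full array
  return sum / double(v.size());
}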
using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // construction--perform shallow copy of (sub)array - view(array3* array) : const_view(array) {} - view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : const_view(array, x, y, z, nx, ny, nz) {} - - // (i, j, k) accessor from base class - using const_view::operator(); - - // (i, j, k) mutator - reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); } -}; - -// flat view of 3D array (operator[] returns scalar) -class flat_view : public view { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // construction--perform shallow copy of (sub)array - flat_view(array3* array) : view(array) {} - flat_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : view(array, x, y, z, nx, ny, nz) {} - - // convert (i, j, k) index to flat index - uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); } - - // convert flat index to (i, j, k) index - void ijk(uint& i, uint& j, uint& k, uint index) const - { - i = index % nx; index /= nx; - j = index % ny; index /= ny; - k = index; - } - - // flat index accessors - Scalar operator[](uint index) const - { - uint i, j, k; - ijk(i, j, k, index); - return array->get(x + i, y + j, z + k); - } - reference operator[](uint index) - { - uint i, j, k; - ijk(i, j, k, index); - return reference(array, x + i, y + j, z + k); - } -}; - -// forward declaration of friends -class nested_view1; -class nested_view2; -class nested_view3; - -// nested view into a 1D rectangular subset of a 3D array -class nested_view1 : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // dimensions of (sub)array - uint size_x() const { return nx; } - - // [i] accessor and mutator - Scalar operator[](uint index) const { return array->get(x + index, y, z); } - reference operator[](uint index) { return reference(array, x + index, y, z); } - - // (i) accessor and mutator - Scalar operator()(uint i) const { return array->get(x + i, y, z); } - reference operator()(uint i) { return reference(array, x + i, y, z); } - -protected: - // construction--perform shallow copy of (sub)array - friend class nested_view2; - explicit nested_view1(array3* array) : preview(array) {} - explicit nested_view1(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} -}; - -// nested view into a 2D rectangular subset of a 3D array -class nested_view2 : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - - // 1D view - nested_view1 operator[](uint index) const { return nested_view1(array, x, y + index, z, nx, 1, 1); } - - // (i, j) accessor and mutator - Scalar operator()(uint i, uint j) const { return array->get(x + i, y + j, z); } - reference operator()(uint i, uint j) { return reference(array, x + i, y + j, z); } - -protected: - // construction--perform shallow copy of (sub)array - friend class nested_view3; - explicit nested_view2(array3* array) : preview(array) {} - explicit nested_view2(array3* array, 
uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} -}; - -// nested view into a 3D rectangular subset of a 3D array -class nested_view3 : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // construction--perform shallow copy of (sub)array - nested_view3(array3* array) : preview(array) {} - nested_view3(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : preview(array, x, y, z, nx, ny, nz) {} - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - uint size_z() const { return nz; } - - // 2D view - nested_view2 operator[](uint index) const { return nested_view2(array, x, y, z + index, nx, ny, 1); } - - // (i, j, k) accessor and mutator - Scalar operator()(uint i, uint j, uint k) const { return array->get(x + i, y + j, z + k); } - reference operator()(uint i, uint j, uint k) { return reference(array, x + i, y + j, z + k); } -}; - -typedef nested_view3 nested_view; - -// thread-safe read-only view of 3D (sub)array with private cache -class private_const_view : public preview { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; -public: - // construction--perform shallow copy of (sub)array - private_const_view(array3* array) : - preview(array), - cache(array->cache.size()) - { - init(); - } - private_const_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : - preview(array, x, y, z, nx, ny, nz), - cache(array->cache.size()) - { - init(); - } - - // destructor - ~private_const_view() - { - stream_close(zfp->stream); - zfp_stream_close(zfp); - } - - // dimensions of (sub)array - uint size_x() const { return nx; } - uint size_y() const { return ny; } - uint size_z() const { return nz; } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - cache.resize(array->lines(csize, nx, ny, nz)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // (i, j, k) accessor - Scalar operator()(uint i, uint j, uint k) const { return get(x + i, y + j, z + k); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - const Scalar& operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; } - Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - protected: - static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); } - Scalar a[64]; - }; - - // copy private data - void init() - { - // copy compressed stream - zfp = zfp_stream_open(0); - *zfp = *array->zfp; - // copy bit stream - zfp->stream = stream_clone(array->zfp->stream); - } - - // inspector - const Scalar& get(uint i, uint j, uint k) const - { - const CacheLine* p = line(i, j, k); - return (*p)(i, j, k); - } - - // return cache line for (i, j, k); may require write-back and fetch - CacheLine* line(uint i, uint j, uint k) const - { - CacheLine* p = 0; - uint b = array->block(i, j, k); - typename Cache::Tag t = cache.access(p, b + 1, false); - uint c = t.index() - 1; - // 
fetch cache line; no writeback possible since view is read-only - if (c != b) - decode(b, p->data()); - return p; - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * array->blkbits); - Codec::decode_block_3(zfp, block, array->shape ? array->shape[index] : 0); - } - - zfp_stream* zfp; // stream of compressed blocks - mutable Cache cache; // cache of decompressed blocks -}; - -// thread-safe read-write view of private 3D (sub)array -class private_view : public private_const_view { -protected: - using preview::array; - using preview::x; - using preview::y; - using preview::z; - using preview::nx; - using preview::ny; - using preview::nz; - using private_const_view::zfp; - using private_const_view::cache; - using private_const_view::init; - using private_const_view::decode; - class view_reference; - typedef typename private_const_view::CacheLine CacheLine; -public: - // construction--perform shallow copy of (sub)array - private_view(array3* array) : private_const_view(array) {} - private_view(array3* array, uint x, uint y, uint z, uint nx, uint ny, uint nz) : private_const_view(array, x, y, z, nx, ny, nz) {} - - // partition view into count block-aligned pieces, with 0 <= index < count - void partition(uint index, uint count) - { - if (nx > std::max(ny, nz)) - partition(x, nx, index, count); - else if (ny > std::max(nx, nz)) - partition(y, ny, index, count); - else - partition(z, nz, index, count); - } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // (i, j, k) accessor from base class - using private_const_view::operator(); - - // (i, j, k) mutator - view_reference operator()(uint i, uint j, uint k) { return view_reference(this, x + i, y + j, z + k); } - -protected: - class view_reference { - public: - operator Scalar() const { return view->get(i, j, k); } - view_reference operator=(const view_reference& r) { view->set(i, j, k, r.operator Scalar()); return *this; } - view_reference operator=(Scalar val) { view->set(i, j, k, val); return *this; } - view_reference operator+=(Scalar val) { view->add(i, j, k, val); return *this; } - view_reference operator-=(Scalar val) { view->sub(i, j, k, val); return *this; } - view_reference operator*=(Scalar val) { view->mul(i, j, k, val); return *this; } - view_reference operator/=(Scalar val) { view->div(i, j, k, val); return *this; } - // swap two array elements via proxy references - friend void swap(view_reference a, view_reference b) - { - Scalar x = a.operator Scalar(); - Scalar y = b.operator Scalar(); - b.operator=(x); - a.operator=(y); - } - - protected: - friend class private_view; - explicit view_reference(private_view* view, uint i, uint j, uint k) : view(view), i(i), j(j), k(k) {} - private_view* view; - uint i, j, k; - }; - - // block-aligned partition of [offset, offset + size): index out of count - static void partition(uint& offset, uint& size, uint index, uint count) - { - uint bmin = offset / 4; - uint bmax = (offset + size + 3) / 4; - uint xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); - uint xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); - offset = xmin; - size = xmax - xmin; - } - - // mutator - void set(uint i, uint j, uint k, Scalar val) - { - CacheLine* p = 
line(i, j, k, true); - (*p)(i, j, k) = val; - } - - // in-place updates - void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; } - void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; } - void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; } - void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; } - - // return cache line for (i, j, k); may require write-back and fetch - CacheLine* line(uint i, uint j, uint k, bool write) const - { - CacheLine* p = 0; - uint b = array->block(i, j, k); - typename Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - decode(b, p->data()); - } - return p; - } - - // encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * array->blkbits); - Codec::encode_block_3(zfp, block, array->shape ? array->shape[index] : 0); - stream_flush(zfp->stream); - } -}; diff --git a/array/zfparray.h b/array/zfparray.h deleted file mode 100644 index 2ddbde42..00000000 --- a/array/zfparray.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef ZFP_ARRAY_H -#define ZFP_ARRAY_H - -#include -#include -#include -#include -#include - -#include "zfp.h" -#include "zfp/memory.h" - -// all undefined at end -#define DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y)) -#define BITS_TO_BYTES(x) DIV_ROUND_UP(x, CHAR_BIT) - -#define ZFP_HEADER_SIZE_BITS (ZFP_MAGIC_BITS + ZFP_META_BITS + ZFP_MODE_SHORT_BITS) - -namespace zfp { - -// abstract base class for compressed array of scalars -class array { -public: - #include "zfp/header.h" - - static zfp::array* construct(const zfp::array::header& header, const uchar* buffer = 0, size_t buffer_size_bytes = 0); - -protected: - // default constructor - array() : - dims(0), type(zfp_type_none), - nx(0), ny(0), nz(0), - bx(0), by(0), bz(0), - blocks(0), blkbits(0), - bytes(0), data(0), - zfp(0), - shape(0) - {} - - // generic array with 'dims' dimensions and scalar type 'type' - array(uint dims, zfp_type type) : - dims(dims), type(type), - nx(0), ny(0), nz(0), - bx(0), by(0), bz(0), - blocks(0), blkbits(0), - bytes(0), data(0), - zfp(zfp_stream_open(0)), - shape(0) - {} - - // constructor, from previously-serialized compressed array - array(uint dims, zfp_type type, const zfp::array::header& h, size_t expected_buffer_size_bytes) : - dims(dims), type(type), - nx(0), ny(0), nz(0), - bx(0), by(0), bz(0), - blocks(0), blkbits(0), - bytes(0), data(0), - zfp(zfp_stream_open(0)), - shape(0) - { - // read header to populate member variables associated with zfp_stream - try { - read_from_header(h); - } catch (zfp::array::header::exception const &) { - zfp_stream_close(zfp); - throw; - } - - if (expected_buffer_size_bytes && !is_valid_buffer_size(zfp, nx, ny, nz, expected_buffer_size_bytes)) { - zfp_stream_close(zfp); - throw zfp::array::header::exception("ZFP header expects a longer buffer than what was passed in."); - } - } - - // copy constructor--performs a deep copy - array(const array& a) : - data(0), - zfp(0), - shape(0) - { - deep_copy(a); - } - - // assignment operator--performs a deep copy - array& operator=(const array& a) - { - deep_copy(a); - return *this; - } - -public: - // public virtual destructor (can delete array through base class pointer) - virtual ~array() - { - free(); - zfp_stream_close(zfp); - } - - // rate in bits per value - double 
rate() const { return double(blkbits) / block_size(); } - - // set compression rate in bits per value - double set_rate(double rate) - { - rate = zfp_stream_set_rate(zfp, rate, type, dims, 1); - blkbits = zfp->maxbits; - alloc(); - return rate; - } - - // empty cache without compressing modified cached blocks - virtual void clear_cache() const = 0; - - // flush cache by compressing all modified cached blocks - virtual void flush_cache() const = 0; - - // number of bytes of compressed data - size_t compressed_size() const { return bytes; } - - // pointer to compressed data for read or write access - uchar* compressed_data() const - { - // first write back any modified cached data - flush_cache(); - return data; - } - - // dimensionality - uint dimensionality() const { return dims; } - - // underlying scalar type - zfp_type scalar_type() const { return type; } - - // write header with latest metadata - zfp::array::header get_header() const - { - // intermediate buffer needed (bitstream accesses multiples of wordsize) - AlignedBufferHandle abh; - DualBitstreamHandle dbh(zfp, abh); - - ZfpFieldHandle zfh(type, nx, ny, nz); - - // avoid long header (alignment issue) - if (zfp_stream_mode(zfp) > ZFP_MODE_SHORT_MAX) - throw zfp::array::header::exception("ZFP compressed arrays only support short headers at this time."); - - if (!zfp_write_header(zfp, zfh.field, ZFP_HEADER_FULL)) - throw zfp::array::header::exception("ZFP could not write a header to buffer."); - stream_flush(zfp->stream); - - zfp::array::header h; - abh.copy_to_header(&h); - - return h; - } - -private: - // private members used when reading/writing headers - #include "zfp/headerHelpers.h" - -protected: - // number of values per block - uint block_size() const { return 1u << (2 * dims); } - - // allocate memory for compressed data - void alloc(bool clear = true) - { - bytes = blocks * blkbits / CHAR_BIT; - zfp::reallocate_aligned(data, bytes, 0x100u); - if (clear) - std::fill(data, data + bytes, 0); - stream_close(zfp->stream); - zfp_stream_set_bit_stream(zfp, stream_open(data, bytes)); - clear_cache(); - } - - // free memory associated with compressed data - void free() - { - nx = ny = nz = 0; - bx = by = bz = 0; - blocks = 0; - stream_close(zfp->stream); - zfp_stream_set_bit_stream(zfp, 0); - bytes = 0; - zfp::deallocate_aligned(data); - data = 0; - zfp::deallocate(shape); - shape = 0; - } - - // perform a deep copy - void deep_copy(const array& a) - { - // copy metadata - dims = a.dims; - type = a.type; - nx = a.nx; - ny = a.ny; - nz = a.nz; - bx = a.bx; - by = a.by; - bz = a.bz; - blocks = a.blocks; - blkbits = a.blkbits; - bytes = a.bytes; - - // copy dynamically allocated data - zfp::clone_aligned(data, a.data, bytes, 0x100u); - if (zfp) { - if (zfp->stream) - stream_close(zfp->stream); - zfp_stream_close(zfp); - } - zfp = zfp_stream_open(0); - *zfp = *a.zfp; - zfp_stream_set_bit_stream(zfp, stream_open(data, bytes)); - zfp::clone(shape, a.shape, blocks); - } - - // attempt reading header from zfp::array::header - // and verify header contents (throws exceptions upon failure) - void read_from_header(const zfp::array::header& h) - { - // copy header into aligned buffer - AlignedBufferHandle abh(&h); - DualBitstreamHandle dbh(zfp, abh); - ZfpFieldHandle zfh; - - // read header to populate member variables associated with zfp_stream - size_t readbits = zfp_read_header(zfp, zfh.field, ZFP_HEADER_FULL); - if (!readbits) - throw zfp::array::header::exception("Invalid ZFP header."); - else if (readbits != ZFP_HEADER_SIZE_BITS) - 
throw zfp::array::header::exception("ZFP compressed arrays only support short headers at this time."); - - // verify metadata on zfp_field match that for this object - std::string err_msg = ""; - if (type != zfp_field_type(zfh.field)) - zfp::array::header::concat_sentence(err_msg, "ZFP header specified an underlying scalar type different than that for this object."); - - if (dims != zfp_field_dimensionality(zfh.field)) - zfp::array::header::concat_sentence(err_msg, "ZFP header specified a dimensionality different than that for this object."); - - verify_header_contents(zfp, zfh.field, err_msg); - - if (!err_msg.empty()) - throw zfp::array::header::exception(err_msg); - - // set class variables - nx = zfh.field->nx; - ny = zfh.field->ny; - nz = zfh.field->nz; - type = zfh.field->type; - blkbits = zfp->maxbits; - } - - // default number of cache lines for array with n blocks - static uint lines(size_t n) - { - // compute m = O(sqrt(n)) - size_t m; - for (m = 1; m * m < n; m *= 2); - return static_cast(m); - } - - uint dims; // array dimensionality (1, 2, or 3) - zfp_type type; // scalar type - uint nx, ny, nz; // array dimensions - uint bx, by, bz; // array dimensions in number of blocks - uint blocks; // number of blocks - size_t blkbits; // number of bits per compressed block - size_t bytes; // total bytes of compressed data - mutable uchar* data; // pointer to compressed data - zfp_stream* zfp; // compressed stream of blocks - uchar* shape; // precomputed block dimensions (or null if uniform) -}; - -#undef DIV_ROUND_UP -#undef BITS_TO_BYTES - -#undef ZFP_HEADER_SIZE_BITS - -} - -#endif diff --git a/array/zfparray1.h b/array/zfparray1.h deleted file mode 100644 index f95b430d..00000000 --- a/array/zfparray1.h +++ /dev/null @@ -1,297 +0,0 @@ -#ifndef ZFP_ARRAY1_H -#define ZFP_ARRAY1_H - -#include -#include -#include -#include "zfparray.h" -#include "zfpcodec.h" -#include "zfp/cache.h" - -namespace zfp { - -// compressed 1D array of scalars -template < typename Scalar, class Codec = zfp::codec > -class array1 : public array { -public: - // forward declarations - class reference; - class pointer; - class iterator; - class view; - #include "zfp/reference1.h" - #include "zfp/pointer1.h" - #include "zfp/iterator1.h" - #include "zfp/view1.h" - - // default constructor - array1() : array(1, Codec::type) {} - - // constructor of n-sample array using rate bits per value, at least - // csize bytes of cache, and optionally initialized from flat array p - array1(uint n, double rate, const Scalar* p = 0, size_t csize = 0) : - array(1, Codec::type), - cache(lines(csize, n)) - { - set_rate(rate); - resize(n, p == 0); - if (p) - set(p); - } - - // constructor, from previously-serialized compressed array - array1(const zfp::array::header& h, const uchar* buffer = 0, size_t buffer_size_bytes = 0) : - array(1, Codec::type, h, buffer_size_bytes) - { - resize(nx, false); - if (buffer) - memcpy(data, buffer, bytes); - } - - // copy constructor--performs a deep copy - array1(const array1& a) : - array() - { - deep_copy(a); - } - - // construction from view--perform deep copy of (sub)array - template - array1(const View& v) : - array(1, Codec::type), - cache(lines(0, v.size_x())) - { - set_rate(v.rate()); - resize(v.size_x(), true); - // initialize array in its preferred order - for (iterator it = begin(); it != end(); ++it) - *it = v(it.i()); - } - - // virtual destructor - virtual ~array1() {} - - // assignment operator--performs a deep copy - array1& operator=(const array1& a) - { - if (this != &a) - 
deep_copy(a); - return *this; - } - - // total number of elements in array - size_t size() const { return size_t(nx); } - - // array dimensions - uint size_x() const { return nx; } - - // resize the array (all previously stored data will be lost) - void resize(uint n, bool clear = true) - { - if (n == 0) - free(); - else { - nx = n; - bx = (nx + 3) / 4; - blocks = bx; - alloc(clear); - - // precompute block dimensions - zfp::deallocate(shape); - if (nx & 3u) { - shape = (uchar*)zfp::allocate(blocks); - uchar* p = shape; - for (uint i = 0; i < bx; i++) - *p++ = (i == bx - 1 ? -nx & 3u : 0); - } - else - shape = 0; - } - } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - flush_cache(); - cache.resize(lines(csize, nx)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename zfp::Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // decompress array and store at p - void get(Scalar* p) const - { - uint b = 0; - for (uint i = 0; i < bx; i++, p += 4, b++) { - const CacheLine* line = cache.lookup(b + 1); - if (line) - line->get(p, 1, shape ? shape[b] : 0); - else - decode(b, p, 1); - } - } - - // initialize array by copying and compressing data stored at p - void set(const Scalar* p) - { - uint b = 0; - for (uint i = 0; i < bx; i++, b++, p += 4) - encode(b, p, 1); - cache.clear(); - } - - // (i) accessors - Scalar operator()(uint i) const { return get(i); } - reference operator()(uint i) { return reference(this, i); } - - // flat index accessors - Scalar operator[](uint index) const { return get(index); } - reference operator[](uint index) { return reference(this, index); } - - // random access iterators - iterator begin() { return iterator(this, 0); } - iterator end() { return iterator(this, nx); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - Scalar operator()(uint i) const { return a[index(i)]; } - Scalar& operator()(uint i) { return a[index(i)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - // copy cache line - void get(Scalar* p, int sx) const - { - const Scalar* q = a; - for (uint x = 0; x < 4; x++, p += sx, q++) - *p = *q; - } - void get(Scalar* p, int sx, uint shape) const - { - if (!shape) - get(p, sx); - else { - // determine block dimensions - uint nx = 4 - (shape & 3u); shape >>= 2; - const Scalar* q = a; - for (uint x = 0; x < nx; x++, p += sx, q++) - *p = *q; - } - } - protected: - static uint index(uint i) { return i & 3u; } - Scalar a[4]; - }; - - // perform a deep copy - void deep_copy(const array1& a) - { - // copy base class members - array::deep_copy(a); - // copy cache - cache = a.cache; - } - - // inspector - Scalar get(uint i) const - { - const CacheLine* p = line(i, false); - return (*p)(i); - } - - // mutator - void set(uint i, Scalar val) - { - CacheLine* p = line(i, true); - (*p)(i) = val; - } - - // in-place updates - void add(uint i, Scalar val) { (*line(i, true))(i) += val; } - void sub(uint i, Scalar val) { (*line(i, true))(i) -= val; } - void mul(uint i, Scalar val) { (*line(i, true))(i) *= val; } - void div(uint i, 
Scalar val) { (*line(i, true))(i) /= val; } - - // return cache line for i; may require write-back and fetch - CacheLine* line(uint i, bool write) const - { - CacheLine* p = 0; - uint b = block(i); - typename zfp::Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - // fetch cache line - decode(b, p->data()); - } - return p; - } - - // encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_1(zfp, block, shape ? shape[index] : 0); - stream_flush(zfp->stream); - } - - // encode block with given index from strided array - void encode(uint index, const Scalar* p, int sx) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx); - stream_flush(zfp->stream); - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_1(zfp, block, shape ? shape[index] : 0); - } - - // decode block with given index to strided array - void decode(uint index, Scalar* p, int sx) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_strided_1(zfp, p, shape ? shape[index] : 0, sx); - } - - // block index for i - static uint block(uint i) { return i / 4; } - - // number of cache lines corresponding to size (or suggested size if zero) - static uint lines(size_t size, uint n) - { - n = size ? (size + sizeof(CacheLine) - 1) / sizeof(CacheLine) : array::lines(size_t((n + 3) / 4)); - return std::max(n, 1u); - } - - mutable zfp::Cache cache; // cache of decompressed blocks -}; - -typedef array1 array1f; -typedef array1 array1d; - -} - -#endif diff --git a/array/zfparray2.h b/array/zfparray2.h deleted file mode 100644 index 73dfaa8d..00000000 --- a/array/zfparray2.h +++ /dev/null @@ -1,324 +0,0 @@ -#ifndef ZFP_ARRAY2_H -#define ZFP_ARRAY2_H - -#include -#include -#include -#include "zfparray.h" -#include "zfpcodec.h" -#include "zfp/cache.h" - -namespace zfp { - -// compressed 2D array of scalars -template < typename Scalar, class Codec = zfp::codec > -class array2 : public array { -public: - // forward declarations - class reference; - class pointer; - class iterator; - class view; - #include "zfp/reference2.h" - #include "zfp/pointer2.h" - #include "zfp/iterator2.h" - #include "zfp/view2.h" - - // default constructor - array2() : array(2, Codec::type) {} - - // constructor of nx * ny array using rate bits per value, at least - // csize bytes of cache, and optionally initialized from flat array p - array2(uint nx, uint ny, double rate, const Scalar* p = 0, size_t csize = 0) : - array(2, Codec::type), - cache(lines(csize, nx, ny)) - { - set_rate(rate); - resize(nx, ny, p == 0); - if (p) - set(p); - } - - // constructor, from previously-serialized compressed array - array2(const zfp::array::header& h, const uchar* buffer = 0, size_t buffer_size_bytes = 0) : - array(2, Codec::type, h, buffer_size_bytes) - { - resize(nx, ny, false); - if (buffer) - memcpy(data, buffer, bytes); - } - - // copy constructor--performs a deep copy - array2(const array2& a) : - array() - { - deep_copy(a); - } - - // construction from view--perform deep copy of (sub)array - template - array2(const View& v) : - array(2, Codec::type), - cache(lines(0, v.size_x(), v.size_y())) - { - set_rate(v.rate()); - resize(v.size_x(), v.size_y(), true); 
- // initialize array in its preferred order - for (iterator it = begin(); it != end(); ++it) - *it = v(it.i(), it.j()); - } - - // virtual destructor - virtual ~array2() {} - - // assignment operator--performs a deep copy - array2& operator=(const array2& a) - { - if (this != &a) - deep_copy(a); - return *this; - } - - // total number of elements in array - size_t size() const { return size_t(nx) * size_t(ny); } - - // array dimensions - uint size_x() const { return nx; } - uint size_y() const { return ny; } - - // resize the array (all previously stored data will be lost) - void resize(uint nx, uint ny, bool clear = true) - { - if (nx == 0 || ny == 0) - free(); - else { - this->nx = nx; - this->ny = ny; - bx = (nx + 3) / 4; - by = (ny + 3) / 4; - blocks = bx * by; - alloc(clear); - - // precompute block dimensions - zfp::deallocate(shape); - if ((nx | ny) & 3u) { - shape = (uchar*)zfp::allocate(blocks); - uchar* p = shape; - for (uint j = 0; j < by; j++) - for (uint i = 0; i < bx; i++) - *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * (j == by - 1 ? -ny & 3u : 0); - } - else - shape = 0; - } - } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - flush_cache(); - cache.resize(lines(csize, nx, ny)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename zfp::Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // decompress array and store at p - void get(Scalar* p) const - { - uint b = 0; - for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) - for (uint i = 0; i < bx; i++, p += 4, b++) { - const CacheLine* line = cache.lookup(b + 1); - if (line) - line->get(p, 1, nx, shape ? 
shape[b] : 0); - else - decode(b, p, 1, nx); - } - } - - // initialize array by copying and compressing data stored at p - void set(const Scalar* p) - { - uint b = 0; - for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) - for (uint i = 0; i < bx; i++, p += 4, b++) - encode(b, p, 1, nx); - cache.clear(); - } - - // (i, j) accessors - Scalar operator()(uint i, uint j) const { return get(i, j); } - reference operator()(uint i, uint j) { return reference(this, i, j); } - - // flat index accessors - Scalar operator[](uint index) const - { - uint i, j; - ij(i, j, index); - return get(i, j); - } - reference operator[](uint index) - { - uint i, j; - ij(i, j, index); - return reference(this, i, j); - } - - // sequential iterators - iterator begin() { return iterator(this, 0, 0); } - iterator end() { return iterator(this, 0, ny); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - Scalar operator()(uint i, uint j) const { return a[index(i, j)]; } - Scalar& operator()(uint i, uint j) { return a[index(i, j)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - // copy cache line - void get(Scalar* p, int sx, int sy) const - { - const Scalar* q = a; - for (uint y = 0; y < 4; y++, p += sy - 4 * sx) - for (uint x = 0; x < 4; x++, p += sx, q++) - *p = *q; - } - void get(Scalar* p, int sx, int sy, uint shape) const - { - if (!shape) - get(p, sx, sy); - else { - // determine block dimensions - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - const Scalar* q = a; - for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) - for (uint x = 0; x < nx; x++, p += sx, q++) - *p = *q; - } - } - protected: - static uint index(uint i, uint j) { return (i & 3u) + 4 * (j & 3u); } - Scalar a[16]; - }; - - // perform a deep copy - void deep_copy(const array2& a) - { - // copy base class members - array::deep_copy(a); - // copy cache - cache = a.cache; - } - - // inspector - Scalar get(uint i, uint j) const - { - const CacheLine* p = line(i, j, false); - return (*p)(i, j); - } - - // mutator - void set(uint i, uint j, Scalar val) - { - CacheLine* p = line(i, j, true); - (*p)(i, j) = val; - } - - // in-place updates - void add(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) += val; } - void sub(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) -= val; } - void mul(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) *= val; } - void div(uint i, uint j, Scalar val) { (*line(i, j, true))(i, j) /= val; } - - // return cache line for (i, j); may require write-back and fetch - CacheLine* line(uint i, uint j, bool write) const - { - CacheLine* p = 0; - uint b = block(i, j); - typename zfp::Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - // fetch cache line - decode(b, p->data()); - } - return p; - } - - // encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_2(zfp, block, shape ? shape[index] : 0); - stream_flush(zfp->stream); - } - - // encode block with given index from strided array - void encode(uint index, const Scalar* p, int sx, int sy) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_strided_2(zfp, p, shape ? 
shape[index] : 0, sx, sy); - stream_flush(zfp->stream); - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_2(zfp, block, shape ? shape[index] : 0); - } - - // decode block with given index to strided array - void decode(uint index, Scalar* p, int sx, int sy) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_strided_2(zfp, p, shape ? shape[index] : 0, sx, sy); - } - - // block index for (i, j) - uint block(uint i, uint j) const { return (i / 4) + bx * (j / 4); } - - // convert flat index to (i, j) - void ij(uint& i, uint& j, uint index) const - { - i = index % nx; - index /= nx; - j = index; - } - - // number of cache lines corresponding to size (or suggested size if zero) - static uint lines(size_t size, uint nx, uint ny) - { - uint n = size ? uint((size + sizeof(CacheLine) - 1) / sizeof(CacheLine)) : array::lines(size_t((nx + 3) / 4) * size_t((ny + 3) / 4)); - return std::max(n, 1u); - } - - mutable zfp::Cache cache; // cache of decompressed blocks -}; - -typedef array2 array2f; -typedef array2 array2d; - -} - -#endif diff --git a/array/zfparray3.h b/array/zfparray3.h deleted file mode 100644 index f0f42e88..00000000 --- a/array/zfparray3.h +++ /dev/null @@ -1,338 +0,0 @@ -#ifndef ZFP_ARRAY3_H -#define ZFP_ARRAY3_H - -#include -#include -#include -#include "zfparray.h" -#include "zfpcodec.h" -#include "zfp/cache.h" - -namespace zfp { - -// compressed 3D array of scalars -template < typename Scalar, class Codec = zfp::codec > -class array3 : public array { -public: - // forward declarations - class reference; - class pointer; - class iterator; - class view; - #include "zfp/reference3.h" - #include "zfp/pointer3.h" - #include "zfp/iterator3.h" - #include "zfp/view3.h" - - // default constructor - array3() : array(3, Codec::type) {} - - // constructor of nx * ny * nz array using rate bits per value, at least - // csize bytes of cache, and optionally initialized from flat array p - array3(uint nx, uint ny, uint nz, double rate, const Scalar* p = 0, size_t csize = 0) : - array(3, Codec::type), - cache(lines(csize, nx, ny, nz)) - { - set_rate(rate); - resize(nx, ny, nz, p == 0); - if (p) - set(p); - } - - // constructor, from previously-serialized compressed array - array3(const zfp::array::header& h, const uchar* buffer = 0, size_t buffer_size_bytes = 0) : - array(3, Codec::type, h, buffer_size_bytes) - { - resize(nx, ny, nz, false); - if (buffer) - memcpy(data, buffer, bytes); - } - - // copy constructor--performs a deep copy - array3(const array3& a) : - array() - { - deep_copy(a); - } - - // construction from view--perform deep copy of (sub)array - template - array3(const View& v) : - array(3, Codec::type), - cache(lines(0, v.size_x(), v.size_y(), v.size_z())) - { - set_rate(v.rate()); - resize(v.size_x(), v.size_y(), v.size_z(), true); - // initialize array in its preferred order - for (iterator it = begin(); it != end(); ++it) - *it = v(it.i(), it.j(), it.k()); - } - - // virtual destructor - virtual ~array3() {} - - // assignment operator--performs a deep copy - array3& operator=(const array3& a) - { - if (this != &a) - deep_copy(a); - return *this; - } - - // total number of elements in array - size_t size() const { return size_t(nx) * size_t(ny) * size_t(nz); } - - // array dimensions - uint size_x() const { return nx; } - uint size_y() const { return ny; } - uint size_z() const { return nz; } - - // resize the array (all previously stored data will 
be lost) - void resize(uint nx, uint ny, uint nz, bool clear = true) - { - if (nx == 0 || ny == 0 || nz == 0) - free(); - else { - this->nx = nx; - this->ny = ny; - this->nz = nz; - bx = (nx + 3) / 4; - by = (ny + 3) / 4; - bz = (nz + 3) / 4; - blocks = bx * by * bz; - alloc(clear); - - // precompute block dimensions - zfp::deallocate(shape); - if ((nx | ny | nz) & 3u) { - shape = (uchar*)zfp::allocate(blocks); - uchar* p = shape; - for (uint k = 0; k < bz; k++) - for (uint j = 0; j < by; j++) - for (uint i = 0; i < bx; i++) - *p++ = (i == bx - 1 ? -nx & 3u : 0) + 4 * ((j == by - 1 ? -ny & 3u : 0) + 4 * (k == bz - 1 ? -nz & 3u : 0)); - } - else - shape = 0; - } - } - - // cache size in number of bytes - size_t cache_size() const { return cache.size() * sizeof(CacheLine); } - - // set minimum cache size in bytes (array dimensions must be known) - void set_cache_size(size_t csize) - { - flush_cache(); - cache.resize(lines(csize, nx, ny, nz)); - } - - // empty cache without compressing modified cached blocks - void clear_cache() const { cache.clear(); } - - // flush cache by compressing all modified cached blocks - void flush_cache() const - { - for (typename zfp::Cache::const_iterator p = cache.first(); p; p++) { - if (p->tag.dirty()) { - uint b = p->tag.index() - 1; - encode(b, p->line->data()); - } - cache.flush(p->line); - } - } - - // decompress array and store at p - void get(Scalar* p) const - { - uint b = 0; - for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by)) - for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) - for (uint i = 0; i < bx; i++, p += 4, b++) { - const CacheLine* line = cache.lookup(b + 1); - if (line) - line->get(p, 1, nx, nx * ny, shape ? shape[b] : 0); - else - decode(b, p, 1, nx, nx * ny); - } - } - - // initialize array by copying and compressing data stored at p - void set(const Scalar* p) - { - uint b = 0; - for (uint k = 0; k < bz; k++, p += 4 * nx * (ny - by)) - for (uint j = 0; j < by; j++, p += 4 * (nx - bx)) - for (uint i = 0; i < bx; i++, p += 4, b++) - encode(b, p, 1, nx, nx * ny); - cache.clear(); - } - - // (i, j, k) accessors - Scalar operator()(uint i, uint j, uint k) const { return get(i, j, k); } - reference operator()(uint i, uint j, uint k) { return reference(this, i, j, k); } - - // flat index corresponding to (i, j, k) - uint index(uint i, uint j, uint k) const { return i + nx * (j + ny * k); } - - // flat index accessors - Scalar operator[](uint index) const - { - uint i, j, k; - ijk(i, j, k, index); - return get(i, j, k); - } - reference operator[](uint index) - { - uint i, j, k; - ijk(i, j, k, index); - return reference(this, i, j, k); - } - - // sequential iterators - iterator begin() { return iterator(this, 0, 0, 0); } - iterator end() { return iterator(this, 0, 0, nz); } - -protected: - // cache line representing one block of decompressed values - class CacheLine { - public: - Scalar operator()(uint i, uint j, uint k) const { return a[index(i, j, k)]; } - Scalar& operator()(uint i, uint j, uint k) { return a[index(i, j, k)]; } - const Scalar* data() const { return a; } - Scalar* data() { return a; } - // copy cache line - void get(Scalar* p, int sx, int sy, int sz) const - { - const Scalar* q = a; - for (uint z = 0; z < 4; z++, p += sz - 4 * sy) - for (uint y = 0; y < 4; y++, p += sy - 4 * sx) - for (uint x = 0; x < 4; x++, p += sx, q++) - *p = *q; - } - void get(Scalar* p, int sx, int sy, int sz, uint shape) const - { - if (!shape) - get(p, sx, sy, sz); - else { - // determine block dimensions - uint nx = 4 - (shape & 3u); shape >>= 2; - 
uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - const Scalar* q = a; - for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) - for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) - for (uint x = 0; x < nx; x++, p += sx, q++) - *p = *q; - } - } - protected: - static uint index(uint i, uint j, uint k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); } - Scalar a[64]; - }; - - // perform a deep copy - void deep_copy(const array3& a) - { - // copy base class members - array::deep_copy(a); - // copy cache - cache = a.cache; - } - - // inspector - Scalar get(uint i, uint j, uint k) const - { - const CacheLine* p = line(i, j, k, false); - return (*p)(i, j, k); - } - - // mutator - void set(uint i, uint j, uint k, Scalar val) - { - CacheLine* p = line(i, j, k, true); - (*p)(i, j, k) = val; - } - - // in-place updates - void add(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) += val; } - void sub(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) -= val; } - void mul(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) *= val; } - void div(uint i, uint j, uint k, Scalar val) { (*line(i, j, k, true))(i, j, k) /= val; } - - // return cache line for (i, j, k); may require write-back and fetch - CacheLine* line(uint i, uint j, uint k, bool write) const - { - CacheLine* p = 0; - uint b = block(i, j, k); - typename zfp::Cache::Tag t = cache.access(p, b + 1, write); - uint c = t.index() - 1; - if (c != b) { - // write back occupied cache line if it is dirty - if (t.dirty()) - encode(c, p->data()); - // fetch cache line - decode(b, p->data()); - } - return p; - } - - // encode block with given index - void encode(uint index, const Scalar* block) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_3(zfp, block, shape ? shape[index] : 0); - stream_flush(zfp->stream); - } - - // encode block with given index from strided array - void encode(uint index, const Scalar* p, int sx, int sy, int sz) const - { - stream_wseek(zfp->stream, index * blkbits); - Codec::encode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz); - stream_flush(zfp->stream); - } - - // decode block with given index - void decode(uint index, Scalar* block) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_3(zfp, block, shape ? shape[index] : 0); - } - - // decode block with given index to strided array - void decode(uint index, Scalar* p, int sx, int sy, int sz) const - { - stream_rseek(zfp->stream, index * blkbits); - Codec::decode_block_strided_3(zfp, p, shape ? shape[index] : 0, sx, sy, sz); - } - - // block index for (i, j, k) - uint block(uint i, uint j, uint k) const { return (i / 4) + bx * ((j / 4) + by * (k / 4)); } - - // convert flat index to (i, j, k) - void ijk(uint& i, uint& j, uint& k, uint index) const - { - i = index % nx; - index /= nx; - j = index % ny; - index /= ny; - k = index; - } - - // number of cache lines corresponding to size (or suggested size if zero) - static uint lines(size_t size, uint nx, uint ny, uint nz) - { - uint n = size ? 
(size + sizeof(CacheLine) - 1) / sizeof(CacheLine) : array::lines(size_t((nx + 3) / 4) * size_t((ny + 3) / 4) * size_t((nz + 3) / 4)); - return std::max(n, 1u); - } - - mutable zfp::Cache cache; // cache of decompressed blocks -}; - -typedef array3 array3f; -typedef array3 array3d; - -} - -#endif diff --git a/array/zfpcodec.h b/array/zfpcodec.h deleted file mode 100644 index 2d467444..00000000 --- a/array/zfpcodec.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef ZFP_CODEC_H -#define ZFP_CODEC_H - -#include "zfp.h" - -namespace zfp { - -// C++ wrappers around libzfp C functions -template -struct codec {}; - -#include "zfpcodecf.h" -#include "zfpcodecd.h" - -} - -#endif diff --git a/array/zfpcodecd.h b/array/zfpcodecd.h deleted file mode 100644 index 9e7d8932..00000000 --- a/array/zfpcodecd.h +++ /dev/null @@ -1,149 +0,0 @@ -// double-precision codec -template <> -struct codec { - // encode contiguous 1D block - static void encode_block_1(zfp_stream* zfp, const double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_1(zfp, block, nx, 1); - } - else - zfp_encode_block_double_1(zfp, block); - } - - // encode 1D block from strided storage - static void encode_block_strided_1(zfp_stream* zfp, const double* p, uint shape, int sx) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_1(zfp, p, nx, sx); - } - else - zfp_encode_block_strided_double_1(zfp, p, sx); - } - - // encode contiguous 2D block - static void encode_block_2(zfp_stream* zfp, const double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4); - } - else - zfp_encode_block_double_2(zfp, block); - } - - // encode 2D block from strided storage - static void encode_block_strided_2(zfp_stream* zfp, const double* p, uint shape, int sx, int sy) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); - } - else - zfp_encode_block_strided_double_2(zfp, p, sx, sy); - } - - // encode contiguous 3D block - static void encode_block_3(zfp_stream* zfp, const double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16); - } - else - zfp_encode_block_double_3(zfp, block); - } - - // encode 3D block from strided storage - static void encode_block_strided_3(zfp_stream* zfp, const double* p, uint shape, int sx, int sy, int sz) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); - } - else - zfp_encode_block_strided_double_3(zfp, p, sx, sy, sz); - } - - // decode contiguous 1D block - static void decode_block_1(zfp_stream* zfp, double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_double_1(zfp, block, nx, 1); - } - else - zfp_decode_block_double_1(zfp, block); - } - - // decode 1D block to strided storage - static void decode_block_strided_1(zfp_stream* zfp, double* p, uint shape, int sx) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - 
zfp_decode_partial_block_strided_double_1(zfp, p, nx, sx); - } - else - zfp_decode_block_strided_double_1(zfp, p, sx); - } - - // decode contiguous 2D block - static void decode_block_2(zfp_stream* zfp, double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_double_2(zfp, block, nx, ny, 1, 4); - } - else - zfp_decode_block_double_2(zfp, block); - } - - // decode 2D block to strided storage - static void decode_block_strided_2(zfp_stream* zfp, double* p, uint shape, int sx, int sy) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); - } - else - zfp_decode_block_strided_double_2(zfp, p, sx, sy); - } - - // decode contiguous 3D block - static void decode_block_3(zfp_stream* zfp, double* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_double_3(zfp, block, nx, ny, nz, 1, 4, 16); - } - else - zfp_decode_block_double_3(zfp, block); - } - - // decode 3D block to strided storage - static void decode_block_strided_3(zfp_stream* zfp, double* p, uint shape, int sx, int sy, int sz) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); - } - else - zfp_decode_block_strided_double_3(zfp, p, sx, sy, sz); - } - - static const zfp_type type = zfp_type_double; -}; diff --git a/array/zfpcodecf.h b/array/zfpcodecf.h deleted file mode 100644 index 1ec74a60..00000000 --- a/array/zfpcodecf.h +++ /dev/null @@ -1,149 +0,0 @@ -// single-precision codec -template <> -struct codec { - // encode contiguous 1D block - static void encode_block_1(zfp_stream* zfp, const float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_float_1(zfp, block, nx, 1); - } - else - zfp_encode_block_float_1(zfp, block); - } - - // encode 1D block from strided storage - static void encode_block_strided_1(zfp_stream* zfp, const float* p, uint shape, int sx) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_float_1(zfp, p, nx, sx); - } - else - zfp_encode_block_strided_float_1(zfp, p, sx); - } - - // encode contiguous 2D block - static void encode_block_2(zfp_stream* zfp, const float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4); - } - else - zfp_encode_block_float_2(zfp, block); - } - - // encode 2D block from strided storage - static void encode_block_strided_2(zfp_stream* zfp, const float* p, uint shape, int sx, int sy) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); - } - else - zfp_encode_block_strided_float_2(zfp, p, sx, sy); - } - - // encode contiguous 3D block - static void encode_block_3(zfp_stream* zfp, const float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - 
zfp_encode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16); - } - else - zfp_encode_block_float_3(zfp, block); - } - - // encode 3D block from strided storage - static void encode_block_strided_3(zfp_stream* zfp, const float* p, uint shape, int sx, int sy, int sz) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_encode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); - } - else - zfp_encode_block_strided_float_3(zfp, p, sx, sy, sz); - } - - // decode contiguous 1D block - static void decode_block_1(zfp_stream* zfp, float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_1(zfp, block, nx, 1); - } - else - zfp_decode_block_float_1(zfp, block); - } - - // decode 1D block to strided storage - static void decode_block_strided_1(zfp_stream* zfp, float* p, uint shape, int sx) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_1(zfp, p, nx, sx); - } - else - zfp_decode_block_strided_float_1(zfp, p, sx); - } - - // decode contiguous 2D block - static void decode_block_2(zfp_stream* zfp, float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_2(zfp, block, nx, ny, 1, 4); - } - else - zfp_decode_block_float_2(zfp, block); - } - - // decode 2D block to strided storage - static void decode_block_strided_2(zfp_stream* zfp, float* p, uint shape, int sx, int sy) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); - } - else - zfp_decode_block_strided_float_2(zfp, p, sx, sy); - } - - // decode contiguous 3D block - static void decode_block_3(zfp_stream* zfp, float* block, uint shape) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_3(zfp, block, nx, ny, nz, 1, 4, 16); - } - else - zfp_decode_block_float_3(zfp, block); - } - - // decode 3D block to strided storage - static void decode_block_strided_3(zfp_stream* zfp, float* p, uint shape, int sx, int sy, int sz) - { - if (shape) { - uint nx = 4 - (shape & 3u); shape >>= 2; - uint ny = 4 - (shape & 3u); shape >>= 2; - uint nz = 4 - (shape & 3u); shape >>= 2; - zfp_decode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); - } - else - zfp_decode_block_strided_float_3(zfp, p, sx, sy, sz); - } - - static const zfp_type type = zfp_type_float; -}; diff --git a/array/zfpfactory.h b/array/zfpfactory.h deleted file mode 100644 index 44910bd2..00000000 --- a/array/zfpfactory.h +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef ZFP_FACTORY_H -#define ZFP_FACTORY_H - -// (assumes zfparray.h already included) - -zfp::array* zfp::array::construct(const zfp::array::header& header, const uchar* buffer, size_t buffer_size_bytes) -{ - // gather array metadata via C API, then construct with metadata - uint dims = 0; - zfp_type type = zfp_type_none; - double rate = 0; - uint n[4] = {0}; - - // read once (will throw if reads a noncompatible header) - zfp::array::read_header_contents(header, buffer_size_bytes, dims, type, rate, n); - - // construct once (passing zfp::array::header will read it again) - zfp::array* arr = 0; - std::string err_msg = ""; - 
switch (dims) { - case 3: -#ifdef ZFP_ARRAY3_H - switch (type) { - case zfp_type_double: - arr = new zfp::array3d(n[0], n[1], n[2], rate); - break; - - case zfp_type_float: - arr = new zfp::array3f(n[0], n[1], n[2], rate); - break; - - default: - /* NOTREACHED */ - err_msg = "Unexpected ZFP type."; - break; - } -#else - err_msg = "Header files for 3 dimensional ZFP compressed arrays were not included."; -#endif - break; - - case 2: -#ifdef ZFP_ARRAY2_H - switch (type) { - case zfp_type_double: - arr = new zfp::array2d(n[0], n[1], rate); - break; - - case zfp_type_float: - arr = new zfp::array2f(n[0], n[1], rate); - break; - - default: - /* NOTREACHED */ - err_msg = "Unexpected ZFP type."; - break; - } -#else - err_msg = "Header files for 2 dimensional ZFP compressed arrays were not included."; -#endif - break; - - case 1: -#ifdef ZFP_ARRAY1_H - switch (type) { - case zfp_type_double: - arr = new zfp::array1d(n[0], rate); - break; - - case zfp_type_float: - arr = new zfp::array1f(n[0], rate); - break; - - default: - /* NOTREACHED */ - err_msg = "Unexpected ZFP type."; - break; - } -#else - err_msg = "Header files for 1 dimensional ZFP compressed arrays were not included."; -#endif - break; - - default: - err_msg = "ZFP compressed arrays do not yet support dimensionalities beyond 1, 2, and 3."; - break; - } - - if (!err_msg.empty()) - throw zfp::array::header::exception(err_msg); - - if (buffer) - memcpy(arr->compressed_data(), buffer, arr->compressed_size()); - - return arr; -} - -#endif diff --git a/cfp/CMakeLists.txt b/cfp/CMakeLists.txt index febd4f0a..3d8af6ec 100644 --- a/cfp/CMakeLists.txt +++ b/cfp/CMakeLists.txt @@ -1 +1,36 @@ -add_subdirectory(src) +add_library(cfp cfp.cpp) + +if(DEFINED CFP_NAMESPACE) + list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}") +endif() + +list(APPEND cfp_private_defs ${zfp_compressed_array_defs}) + +if(WIN32 AND BUILD_SHARED_LIBS) + # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL + list(APPEND cfp_public_defs ZFP_SHARED_LIBS) + list(APPEND cfp_private_defs ZFP_SOURCE) +endif() + +target_compile_definitions(cfp + PUBLIC ${cfp_public_defs} + PRIVATE ${cfp_private_defs}) + +target_include_directories(cfp + PUBLIC + $ + $ + PRIVATE + ${ZFP_SOURCE_DIR}/src +) + +target_link_libraries(cfp zfp) + +set_property(TARGET cfp PROPERTY VERSION ${ZFP_VERSION}) +set_property(TARGET cfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR}) +set_property(TARGET cfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}cfp) + +install(TARGETS cfp EXPORT cfp-targets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/cfp/src/Makefile b/cfp/Makefile similarity index 59% rename from cfp/src/Makefile rename to cfp/Makefile index eef12ffc..37881a76 100644 --- a/cfp/src/Makefile +++ b/cfp/Makefile @@ -1,9 +1,9 @@ -include ../../Config +include ../Config -CXXFLAGS += -I../../include -I../../src -I../../array -LIBDIR = ../../lib +LIBDIR = ../lib TARGETS = $(LIBDIR)/libcfp.a $(LIBDIR)/libcfp.so -OBJECTS = cfparrays.o +OBJECTS = cfp.o +INCS = -I../include -I../src static: $(LIBDIR)/libcfp.a @@ -19,7 +19,7 @@ $(LIBDIR)/libcfp.a: $(OBJECTS) $(LIBDIR)/libcfp.so: $(OBJECTS) mkdir -p $(LIBDIR) - $(CXX) $(CXXLAGS) -shared $(SOFLAGS) $^ -o $@ + $(CXX) $(CXXFLAGS) -shared $(SOFLAGS) $^ -o $@ .cpp.o: - $(CXX) $(CXXFLAGS) -c $< + $(CXX) $(CXXFLAGS) $(INCS) -c $< diff --git a/cfp/cfp.cpp b/cfp/cfp.cpp new file mode 100644 index 00000000..b360760f --- /dev/null +++ b/cfp/cfp.cpp @@ 
-0,0 +1,868 @@ +#include "cfpheader.cpp" +#include "zfp/array.h" + +#include "cfparray1f.cpp" +#include "cfparray1d.cpp" +#include "cfparray2f.cpp" +#include "cfparray2d.cpp" +#include "cfparray3f.cpp" +#include "cfparray3d.cpp" +#include "cfparray4f.cpp" +#include "cfparray4d.cpp" + +const cfp_api CFP_NAMESPACE = { + // array1f + { + cfp_array1f_ctor_default, + cfp_array1f_ctor, + cfp_array1f_ctor_copy, + cfp_array1f_ctor_header, + cfp_array1f_dtor, + + cfp_array1f_deep_copy, + + cfp_array1f_rate, + cfp_array1f_set_rate, + cfp_array1f_cache_size, + cfp_array1f_set_cache_size, + cfp_array1f_clear_cache, + cfp_array1f_flush_cache, + cfp_array1f_size_bytes, + cfp_array1f_compressed_size, + cfp_array1f_compressed_data, + cfp_array1f_size, + cfp_array1f_resize, + + cfp_array1f_get_array, + cfp_array1f_set_array, + cfp_array1f_get_flat, + cfp_array1f_set_flat, + cfp_array1f_get, + cfp_array1f_set, + + cfp_array1f_ref, + cfp_array1f_ref_flat, + + cfp_array1f_ptr, + cfp_array1f_ptr_flat, + + cfp_array1f_begin, + cfp_array1f_end, + + { + cfp_array1f_cfp_ref1f_get, + cfp_array1f_cfp_ref1f_set, + cfp_array1f_cfp_ref1f_ptr, + cfp_array1f_cfp_ref1f_copy, + }, + + { + cfp_array1f_cfp_ptr1f_get, + cfp_array1f_cfp_ptr1f_get_at, + cfp_array1f_cfp_ptr1f_set, + cfp_array1f_cfp_ptr1f_set_at, + cfp_array1f_cfp_ptr1f_ref, + cfp_array1f_cfp_ptr1f_ref_at, + cfp_array1f_cfp_ptr1f_lt, + cfp_array1f_cfp_ptr1f_gt, + cfp_array1f_cfp_ptr1f_leq, + cfp_array1f_cfp_ptr1f_geq, + cfp_array1f_cfp_ptr1f_eq, + cfp_array1f_cfp_ptr1f_neq, + cfp_array1f_cfp_ptr1f_distance, + cfp_array1f_cfp_ptr1f_next, + cfp_array1f_cfp_ptr1f_prev, + cfp_array1f_cfp_ptr1f_inc, + cfp_array1f_cfp_ptr1f_dec, + }, + + { + cfp_array1f_cfp_iter1f_get, + cfp_array1f_cfp_iter1f_get_at, + cfp_array1f_cfp_iter1f_set, + cfp_array1f_cfp_iter1f_set_at, + cfp_array1f_cfp_iter1f_ref, + cfp_array1f_cfp_iter1f_ref_at, + cfp_array1f_cfp_iter1f_ptr, + cfp_array1f_cfp_iter1f_ptr_at, + cfp_array1f_cfp_iter1f_i, + cfp_array1f_cfp_iter1f_lt, + cfp_array1f_cfp_iter1f_gt, + cfp_array1f_cfp_iter1f_leq, + cfp_array1f_cfp_iter1f_geq, + cfp_array1f_cfp_iter1f_eq, + cfp_array1f_cfp_iter1f_neq, + cfp_array1f_cfp_iter1f_distance, + cfp_array1f_cfp_iter1f_next, + cfp_array1f_cfp_iter1f_prev, + cfp_array1f_cfp_iter1f_inc, + cfp_array1f_cfp_iter1f_dec, + }, + + { + cfp_header_ctor_array1f, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array1d + { + cfp_array1d_ctor_default, + cfp_array1d_ctor, + cfp_array1d_ctor_copy, + cfp_array1d_ctor_header, + cfp_array1d_dtor, + + cfp_array1d_deep_copy, + + cfp_array1d_rate, + cfp_array1d_set_rate, + cfp_array1d_cache_size, + cfp_array1d_set_cache_size, + cfp_array1d_clear_cache, + cfp_array1d_flush_cache, + cfp_array1d_size_bytes, + cfp_array1d_compressed_size, + cfp_array1d_compressed_data, + cfp_array1d_size, + cfp_array1d_resize, + + cfp_array1d_get_array, + cfp_array1d_set_array, + cfp_array1d_get_flat, + cfp_array1d_set_flat, + cfp_array1d_get, + cfp_array1d_set, + + cfp_array1d_ref, + cfp_array1d_ref_flat, + + cfp_array1d_ptr, + cfp_array1d_ptr_flat, + + cfp_array1d_begin, + cfp_array1d_end, + + { + cfp_array1d_cfp_ref1d_get, + cfp_array1d_cfp_ref1d_set, + cfp_array1d_cfp_ref1d_ptr, + cfp_array1d_cfp_ref1d_copy, + }, + + { + cfp_array1d_cfp_ptr1d_get, + cfp_array1d_cfp_ptr1d_get_at, + cfp_array1d_cfp_ptr1d_set, + 
cfp_array1d_cfp_ptr1d_set_at, + cfp_array1d_cfp_ptr1d_ref, + cfp_array1d_cfp_ptr1d_ref_at, + cfp_array1d_cfp_ptr1d_lt, + cfp_array1d_cfp_ptr1d_gt, + cfp_array1d_cfp_ptr1d_leq, + cfp_array1d_cfp_ptr1d_geq, + cfp_array1d_cfp_ptr1d_eq, + cfp_array1d_cfp_ptr1d_neq, + cfp_array1d_cfp_ptr1d_distance, + cfp_array1d_cfp_ptr1d_next, + cfp_array1d_cfp_ptr1d_prev, + cfp_array1d_cfp_ptr1d_inc, + cfp_array1d_cfp_ptr1d_dec, + }, + + { + cfp_array1d_cfp_iter1d_get, + cfp_array1d_cfp_iter1d_get_at, + cfp_array1d_cfp_iter1d_set, + cfp_array1d_cfp_iter1d_set_at, + cfp_array1d_cfp_iter1d_ref, + cfp_array1d_cfp_iter1d_ref_at, + cfp_array1d_cfp_iter1d_ptr, + cfp_array1d_cfp_iter1d_ptr_at, + cfp_array1d_cfp_iter1d_i, + cfp_array1d_cfp_iter1d_lt, + cfp_array1d_cfp_iter1d_gt, + cfp_array1d_cfp_iter1d_leq, + cfp_array1d_cfp_iter1d_geq, + cfp_array1d_cfp_iter1d_eq, + cfp_array1d_cfp_iter1d_neq, + cfp_array1d_cfp_iter1d_distance, + cfp_array1d_cfp_iter1d_next, + cfp_array1d_cfp_iter1d_prev, + cfp_array1d_cfp_iter1d_inc, + cfp_array1d_cfp_iter1d_dec, + }, + + { + cfp_header_ctor_array1d, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array2f + { + cfp_array2f_ctor_default, + cfp_array2f_ctor, + cfp_array2f_ctor_copy, + cfp_array2f_ctor_header, + cfp_array2f_dtor, + + cfp_array2f_deep_copy, + + cfp_array2f_rate, + cfp_array2f_set_rate, + cfp_array2f_cache_size, + cfp_array2f_set_cache_size, + cfp_array2f_clear_cache, + cfp_array2f_flush_cache, + cfp_array2f_size_bytes, + cfp_array2f_compressed_size, + cfp_array2f_compressed_data, + cfp_array2f_size, + cfp_array2f_size_x, + cfp_array2f_size_y, + cfp_array2f_resize, + + cfp_array2f_get_array, + cfp_array2f_set_array, + cfp_array2f_get_flat, + cfp_array2f_set_flat, + cfp_array2f_get, + cfp_array2f_set, + + cfp_array2f_ref, + cfp_array2f_ref_flat, + + cfp_array2f_ptr, + cfp_array2f_ptr_flat, + + cfp_array2f_begin, + cfp_array2f_end, + + { + cfp_array2f_cfp_ref2f_get, + cfp_array2f_cfp_ref2f_set, + cfp_array2f_cfp_ref2f_ptr, + cfp_array2f_cfp_ref2f_copy, + }, + + { + cfp_array2f_cfp_ptr2f_get, + cfp_array2f_cfp_ptr2f_get_at, + cfp_array2f_cfp_ptr2f_set, + cfp_array2f_cfp_ptr2f_set_at, + cfp_array2f_cfp_ptr2f_ref, + cfp_array2f_cfp_ptr2f_ref_at, + cfp_array2f_cfp_ptr2f_lt, + cfp_array2f_cfp_ptr2f_gt, + cfp_array2f_cfp_ptr2f_leq, + cfp_array2f_cfp_ptr2f_geq, + cfp_array2f_cfp_ptr2f_eq, + cfp_array2f_cfp_ptr2f_neq, + cfp_array2f_cfp_ptr2f_distance, + cfp_array2f_cfp_ptr2f_next, + cfp_array2f_cfp_ptr2f_prev, + cfp_array2f_cfp_ptr2f_inc, + cfp_array2f_cfp_ptr2f_dec, + }, + + { + cfp_array2f_cfp_iter2f_get, + cfp_array2f_cfp_iter2f_get_at, + cfp_array2f_cfp_iter2f_set, + cfp_array2f_cfp_iter2f_set_at, + cfp_array2f_cfp_iter2f_ref, + cfp_array2f_cfp_iter2f_ref_at, + cfp_array2f_cfp_iter2f_ptr, + cfp_array2f_cfp_iter2f_ptr_at, + cfp_array2f_cfp_iter2f_i, + cfp_array2f_cfp_iter2f_j, + cfp_array2f_cfp_iter2f_lt, + cfp_array2f_cfp_iter2f_gt, + cfp_array2f_cfp_iter2f_leq, + cfp_array2f_cfp_iter2f_geq, + cfp_array2f_cfp_iter2f_eq, + cfp_array2f_cfp_iter2f_neq, + cfp_array2f_cfp_iter2f_distance, + cfp_array2f_cfp_iter2f_next, + cfp_array2f_cfp_iter2f_prev, + cfp_array2f_cfp_iter2f_inc, + cfp_array2f_cfp_iter2f_dec, + }, + + { + cfp_header_ctor_array2f, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + 
cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array2d + { + cfp_array2d_ctor_default, + cfp_array2d_ctor, + cfp_array2d_ctor_copy, + cfp_array2d_ctor_header, + cfp_array2d_dtor, + + cfp_array2d_deep_copy, + + cfp_array2d_rate, + cfp_array2d_set_rate, + cfp_array2d_cache_size, + cfp_array2d_set_cache_size, + cfp_array2d_clear_cache, + cfp_array2d_flush_cache, + cfp_array2d_size_bytes, + cfp_array2d_compressed_size, + cfp_array2d_compressed_data, + cfp_array2d_size, + cfp_array2d_size_x, + cfp_array2d_size_y, + cfp_array2d_resize, + + cfp_array2d_get_array, + cfp_array2d_set_array, + cfp_array2d_get_flat, + cfp_array2d_set_flat, + cfp_array2d_get, + cfp_array2d_set, + + cfp_array2d_ref, + cfp_array2d_ref_flat, + + cfp_array2d_ptr, + cfp_array2d_ptr_flat, + + cfp_array2d_begin, + cfp_array2d_end, + + { + cfp_array2d_cfp_ref2d_get, + cfp_array2d_cfp_ref2d_set, + cfp_array2d_cfp_ref2d_ptr, + cfp_array2d_cfp_ref2d_copy, + }, + + { + cfp_array2d_cfp_ptr2d_get, + cfp_array2d_cfp_ptr2d_get_at, + cfp_array2d_cfp_ptr2d_set, + cfp_array2d_cfp_ptr2d_set_at, + cfp_array2d_cfp_ptr2d_ref, + cfp_array2d_cfp_ptr2d_ref_at, + cfp_array2d_cfp_ptr2d_lt, + cfp_array2d_cfp_ptr2d_gt, + cfp_array2d_cfp_ptr2d_leq, + cfp_array2d_cfp_ptr2d_geq, + cfp_array2d_cfp_ptr2d_eq, + cfp_array2d_cfp_ptr2d_neq, + cfp_array2d_cfp_ptr2d_distance, + cfp_array2d_cfp_ptr2d_next, + cfp_array2d_cfp_ptr2d_prev, + cfp_array2d_cfp_ptr2d_inc, + cfp_array2d_cfp_ptr2d_dec, + }, + + { + cfp_array2d_cfp_iter2d_get, + cfp_array2d_cfp_iter2d_get_at, + cfp_array2d_cfp_iter2d_set, + cfp_array2d_cfp_iter2d_set_at, + cfp_array2d_cfp_iter2d_ref, + cfp_array2d_cfp_iter2d_ref_at, + cfp_array2d_cfp_iter2d_ptr, + cfp_array2d_cfp_iter2d_ptr_at, + cfp_array2d_cfp_iter2d_i, + cfp_array2d_cfp_iter2d_j, + cfp_array2d_cfp_iter2d_lt, + cfp_array2d_cfp_iter2d_gt, + cfp_array2d_cfp_iter2d_leq, + cfp_array2d_cfp_iter2d_geq, + cfp_array2d_cfp_iter2d_eq, + cfp_array2d_cfp_iter2d_neq, + cfp_array2d_cfp_iter2d_distance, + cfp_array2d_cfp_iter2d_next, + cfp_array2d_cfp_iter2d_prev, + cfp_array2d_cfp_iter2d_inc, + cfp_array2d_cfp_iter2d_dec, + }, + + { + cfp_header_ctor_array2d, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array3f + { + cfp_array3f_ctor_default, + cfp_array3f_ctor, + cfp_array3f_ctor_copy, + cfp_array3f_ctor_header, + cfp_array3f_dtor, + + cfp_array3f_deep_copy, + + cfp_array3f_rate, + cfp_array3f_set_rate, + cfp_array3f_cache_size, + cfp_array3f_set_cache_size, + cfp_array3f_clear_cache, + cfp_array3f_flush_cache, + cfp_array3f_size_bytes, + cfp_array3f_compressed_size, + cfp_array3f_compressed_data, + cfp_array3f_size, + cfp_array3f_size_x, + cfp_array3f_size_y, + cfp_array3f_size_z, + cfp_array3f_resize, + + cfp_array3f_get_array, + cfp_array3f_set_array, + cfp_array3f_get_flat, + cfp_array3f_set_flat, + cfp_array3f_get, + cfp_array3f_set, + + cfp_array3f_ref, + cfp_array3f_ref_flat, + + cfp_array3f_ptr, + cfp_array3f_ptr_flat, + + cfp_array3f_begin, + cfp_array3f_end, + + { + cfp_array3f_cfp_ref3f_get, + cfp_array3f_cfp_ref3f_set, + cfp_array3f_cfp_ref3f_ptr, + cfp_array3f_cfp_ref3f_copy, + }, + + { + cfp_array3f_cfp_ptr3f_get, + cfp_array3f_cfp_ptr3f_get_at, + cfp_array3f_cfp_ptr3f_set, + cfp_array3f_cfp_ptr3f_set_at, + 
cfp_array3f_cfp_ptr3f_ref, + cfp_array3f_cfp_ptr3f_ref_at, + cfp_array3f_cfp_ptr3f_lt, + cfp_array3f_cfp_ptr3f_gt, + cfp_array3f_cfp_ptr3f_leq, + cfp_array3f_cfp_ptr3f_geq, + cfp_array3f_cfp_ptr3f_eq, + cfp_array3f_cfp_ptr3f_neq, + cfp_array3f_cfp_ptr3f_distance, + cfp_array3f_cfp_ptr3f_next, + cfp_array3f_cfp_ptr3f_prev, + cfp_array3f_cfp_ptr3f_inc, + cfp_array3f_cfp_ptr3f_dec, + }, + + { + cfp_array3f_cfp_iter3f_get, + cfp_array3f_cfp_iter3f_get_at, + cfp_array3f_cfp_iter3f_set, + cfp_array3f_cfp_iter3f_set_at, + cfp_array3f_cfp_iter3f_ref, + cfp_array3f_cfp_iter3f_ref_at, + cfp_array3f_cfp_iter3f_ptr, + cfp_array3f_cfp_iter3f_ptr_at, + cfp_array3f_cfp_iter3f_i, + cfp_array3f_cfp_iter3f_j, + cfp_array3f_cfp_iter3f_k, + cfp_array3f_cfp_iter3f_lt, + cfp_array3f_cfp_iter3f_gt, + cfp_array3f_cfp_iter3f_leq, + cfp_array3f_cfp_iter3f_geq, + cfp_array3f_cfp_iter3f_eq, + cfp_array3f_cfp_iter3f_neq, + cfp_array3f_cfp_iter3f_distance, + cfp_array3f_cfp_iter3f_next, + cfp_array3f_cfp_iter3f_prev, + cfp_array3f_cfp_iter3f_inc, + cfp_array3f_cfp_iter3f_dec, + }, + + { + cfp_header_ctor_array3f, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array3d + { + cfp_array3d_ctor_default, + cfp_array3d_ctor, + cfp_array3d_ctor_copy, + cfp_array3d_ctor_header, + cfp_array3d_dtor, + + cfp_array3d_deep_copy, + + cfp_array3d_rate, + cfp_array3d_set_rate, + cfp_array3d_cache_size, + cfp_array3d_set_cache_size, + cfp_array3d_clear_cache, + cfp_array3d_flush_cache, + cfp_array3d_size_bytes, + cfp_array3d_compressed_size, + cfp_array3d_compressed_data, + cfp_array3d_size, + cfp_array3d_size_x, + cfp_array3d_size_y, + cfp_array3d_size_z, + cfp_array3d_resize, + + cfp_array3d_get_array, + cfp_array3d_set_array, + cfp_array3d_get_flat, + cfp_array3d_set_flat, + cfp_array3d_get, + cfp_array3d_set, + + cfp_array3d_ref, + cfp_array3d_ref_flat, + + cfp_array3d_ptr, + cfp_array3d_ptr_flat, + + cfp_array3d_begin, + cfp_array3d_end, + + { + cfp_array3d_cfp_ref3d_get, + cfp_array3d_cfp_ref3d_set, + cfp_array3d_cfp_ref3d_ptr, + cfp_array3d_cfp_ref3d_copy, + }, + + { + cfp_array3d_cfp_ptr3d_get, + cfp_array3d_cfp_ptr3d_get_at, + cfp_array3d_cfp_ptr3d_set, + cfp_array3d_cfp_ptr3d_set_at, + cfp_array3d_cfp_ptr3d_ref, + cfp_array3d_cfp_ptr3d_ref_at, + cfp_array3d_cfp_ptr3d_lt, + cfp_array3d_cfp_ptr3d_gt, + cfp_array3d_cfp_ptr3d_leq, + cfp_array3d_cfp_ptr3d_geq, + cfp_array3d_cfp_ptr3d_eq, + cfp_array3d_cfp_ptr3d_neq, + cfp_array3d_cfp_ptr3d_distance, + cfp_array3d_cfp_ptr3d_next, + cfp_array3d_cfp_ptr3d_prev, + cfp_array3d_cfp_ptr3d_inc, + cfp_array3d_cfp_ptr3d_dec, + }, + + { + cfp_array3d_cfp_iter3d_get, + cfp_array3d_cfp_iter3d_get_at, + cfp_array3d_cfp_iter3d_set, + cfp_array3d_cfp_iter3d_set_at, + cfp_array3d_cfp_iter3d_ref, + cfp_array3d_cfp_iter3d_ref_at, + cfp_array3d_cfp_iter3d_ptr, + cfp_array3d_cfp_iter3d_ptr_at, + cfp_array3d_cfp_iter3d_i, + cfp_array3d_cfp_iter3d_j, + cfp_array3d_cfp_iter3d_k, + cfp_array3d_cfp_iter3d_lt, + cfp_array3d_cfp_iter3d_gt, + cfp_array3d_cfp_iter3d_leq, + cfp_array3d_cfp_iter3d_geq, + cfp_array3d_cfp_iter3d_eq, + cfp_array3d_cfp_iter3d_neq, + cfp_array3d_cfp_iter3d_distance, + cfp_array3d_cfp_iter3d_next, + cfp_array3d_cfp_iter3d_prev, + cfp_array3d_cfp_iter3d_inc, + cfp_array3d_cfp_iter3d_dec, + }, + + { + cfp_header_ctor_array3d, + cfp_header_ctor_buffer, + cfp_header_dtor, + 
cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array4f + { + cfp_array4f_ctor_default, + cfp_array4f_ctor, + cfp_array4f_ctor_copy, + cfp_array4f_ctor_header, + cfp_array4f_dtor, + + cfp_array4f_deep_copy, + + cfp_array4f_rate, + cfp_array4f_set_rate, + cfp_array4f_cache_size, + cfp_array4f_set_cache_size, + cfp_array4f_clear_cache, + cfp_array4f_flush_cache, + cfp_array4f_size_bytes, + cfp_array4f_compressed_size, + cfp_array4f_compressed_data, + cfp_array4f_size, + cfp_array4f_size_x, + cfp_array4f_size_y, + cfp_array4f_size_z, + cfp_array4f_size_w, + cfp_array4f_resize, + + cfp_array4f_get_array, + cfp_array4f_set_array, + cfp_array4f_get_flat, + cfp_array4f_set_flat, + cfp_array4f_get, + cfp_array4f_set, + + cfp_array4f_ref, + cfp_array4f_ref_flat, + + cfp_array4f_ptr, + cfp_array4f_ptr_flat, + + cfp_array4f_begin, + cfp_array4f_end, + + { + cfp_array4f_cfp_ref4f_get, + cfp_array4f_cfp_ref4f_set, + cfp_array4f_cfp_ref4f_ptr, + cfp_array4f_cfp_ref4f_copy, + }, + + { + cfp_array4f_cfp_ptr4f_get, + cfp_array4f_cfp_ptr4f_get_at, + cfp_array4f_cfp_ptr4f_set, + cfp_array4f_cfp_ptr4f_set_at, + cfp_array4f_cfp_ptr4f_ref, + cfp_array4f_cfp_ptr4f_ref_at, + cfp_array4f_cfp_ptr4f_lt, + cfp_array4f_cfp_ptr4f_gt, + cfp_array4f_cfp_ptr4f_leq, + cfp_array4f_cfp_ptr4f_geq, + cfp_array4f_cfp_ptr4f_eq, + cfp_array4f_cfp_ptr4f_neq, + cfp_array4f_cfp_ptr4f_distance, + cfp_array4f_cfp_ptr4f_next, + cfp_array4f_cfp_ptr4f_prev, + cfp_array4f_cfp_ptr4f_inc, + cfp_array4f_cfp_ptr4f_dec, + }, + + { + cfp_array4f_cfp_iter4f_get, + cfp_array4f_cfp_iter4f_get_at, + cfp_array4f_cfp_iter4f_set, + cfp_array4f_cfp_iter4f_set_at, + cfp_array4f_cfp_iter4f_ref, + cfp_array4f_cfp_iter4f_ref_at, + cfp_array4f_cfp_iter4f_ptr, + cfp_array4f_cfp_iter4f_ptr_at, + cfp_array4f_cfp_iter4f_i, + cfp_array4f_cfp_iter4f_j, + cfp_array4f_cfp_iter4f_k, + cfp_array4f_cfp_iter4f_l, + cfp_array4f_cfp_iter4f_lt, + cfp_array4f_cfp_iter4f_gt, + cfp_array4f_cfp_iter4f_leq, + cfp_array4f_cfp_iter4f_geq, + cfp_array4f_cfp_iter4f_eq, + cfp_array4f_cfp_iter4f_neq, + cfp_array4f_cfp_iter4f_distance, + cfp_array4f_cfp_iter4f_next, + cfp_array4f_cfp_iter4f_prev, + cfp_array4f_cfp_iter4f_inc, + cfp_array4f_cfp_iter4f_dec, + }, + + { + cfp_header_ctor_array4f, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, + // array4d + { + cfp_array4d_ctor_default, + cfp_array4d_ctor, + cfp_array4d_ctor_copy, + cfp_array4d_ctor_header, + cfp_array4d_dtor, + + cfp_array4d_deep_copy, + + cfp_array4d_rate, + cfp_array4d_set_rate, + cfp_array4d_cache_size, + cfp_array4d_set_cache_size, + cfp_array4d_clear_cache, + cfp_array4d_flush_cache, + cfp_array4d_size_bytes, + cfp_array4d_compressed_size, + cfp_array4d_compressed_data, + cfp_array4d_size, + cfp_array4d_size_x, + cfp_array4d_size_y, + cfp_array4d_size_z, + cfp_array4d_size_w, + cfp_array4d_resize, + + cfp_array4d_get_array, + cfp_array4d_set_array, + cfp_array4d_get_flat, + cfp_array4d_set_flat, + cfp_array4d_get, + cfp_array4d_set, + + cfp_array4d_ref, + cfp_array4d_ref_flat, + + cfp_array4d_ptr, + cfp_array4d_ptr_flat, + + cfp_array4d_begin, + cfp_array4d_end, + + { + cfp_array4d_cfp_ref4d_get, + cfp_array4d_cfp_ref4d_set, + 
cfp_array4d_cfp_ref4d_ptr, + cfp_array4d_cfp_ref4d_copy, + }, + + { + cfp_array4d_cfp_ptr4d_get, + cfp_array4d_cfp_ptr4d_get_at, + cfp_array4d_cfp_ptr4d_set, + cfp_array4d_cfp_ptr4d_set_at, + cfp_array4d_cfp_ptr4d_ref, + cfp_array4d_cfp_ptr4d_ref_at, + cfp_array4d_cfp_ptr4d_lt, + cfp_array4d_cfp_ptr4d_gt, + cfp_array4d_cfp_ptr4d_leq, + cfp_array4d_cfp_ptr4d_geq, + cfp_array4d_cfp_ptr4d_eq, + cfp_array4d_cfp_ptr4d_neq, + cfp_array4d_cfp_ptr4d_distance, + cfp_array4d_cfp_ptr4d_next, + cfp_array4d_cfp_ptr4d_prev, + cfp_array4d_cfp_ptr4d_inc, + cfp_array4d_cfp_ptr4d_dec, + }, + + { + cfp_array4d_cfp_iter4d_get, + cfp_array4d_cfp_iter4d_get_at, + cfp_array4d_cfp_iter4d_set, + cfp_array4d_cfp_iter4d_set_at, + cfp_array4d_cfp_iter4d_ref, + cfp_array4d_cfp_iter4d_ref_at, + cfp_array4d_cfp_iter4d_ptr, + cfp_array4d_cfp_iter4d_ptr_at, + cfp_array4d_cfp_iter4d_i, + cfp_array4d_cfp_iter4d_j, + cfp_array4d_cfp_iter4d_k, + cfp_array4d_cfp_iter4d_l, + cfp_array4d_cfp_iter4d_lt, + cfp_array4d_cfp_iter4d_gt, + cfp_array4d_cfp_iter4d_leq, + cfp_array4d_cfp_iter4d_geq, + cfp_array4d_cfp_iter4d_eq, + cfp_array4d_cfp_iter4d_neq, + cfp_array4d_cfp_iter4d_distance, + cfp_array4d_cfp_iter4d_next, + cfp_array4d_cfp_iter4d_prev, + cfp_array4d_cfp_iter4d_inc, + cfp_array4d_cfp_iter4d_dec, + }, + + { + cfp_header_ctor_array4d, + cfp_header_ctor_buffer, + cfp_header_dtor, + cfp_header_scalar_type, + cfp_header_dimensionality, + cfp_header_size_x, + cfp_header_size_y, + cfp_header_size_z, + cfp_header_size_w, + cfp_header_rate, + cfp_header_data, + cfp_header_size_bytes, + }, + }, +}; diff --git a/cfp/cfparray1d.cpp b/cfp/cfparray1d.cpp new file mode 100644 index 00000000..3a76b65f --- /dev/null +++ b/cfp/cfparray1d.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array1d.h" +#include "zfp/array1.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array1d +#define CFP_REF_TYPE cfp_ref1d +#define CFP_PTR_TYPE cfp_ptr1d +#define CFP_ITER_TYPE cfp_iter1d +#define ZFP_ARRAY_TYPE zfp::array1d +#define ZFP_SCALAR_TYPE double + +#include "template/cfparray.cpp" +#include "template/cfparray1.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray1f.cpp b/cfp/cfparray1f.cpp new file mode 100644 index 00000000..2df70530 --- /dev/null +++ b/cfp/cfparray1f.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array1f.h" +#include "zfp/array1.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array1f +#define CFP_REF_TYPE cfp_ref1f +#define CFP_PTR_TYPE cfp_ptr1f +#define CFP_ITER_TYPE cfp_iter1f +#define ZFP_ARRAY_TYPE zfp::array1f +#define ZFP_SCALAR_TYPE float + +#include "template/cfparray.cpp" +#include "template/cfparray1.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray2d.cpp b/cfp/cfparray2d.cpp new file mode 100644 index 00000000..fa3051b1 --- /dev/null +++ b/cfp/cfparray2d.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array2d.h" +#include "zfp/array2.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array2d +#define CFP_REF_TYPE cfp_ref2d +#define CFP_PTR_TYPE cfp_ptr2d +#define CFP_ITER_TYPE cfp_iter2d +#define ZFP_ARRAY_TYPE zfp::array2d +#define ZFP_SCALAR_TYPE double + +#include "template/cfparray.cpp" +#include "template/cfparray2.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef 
ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray2f.cpp b/cfp/cfparray2f.cpp new file mode 100644 index 00000000..ebfd1d9d --- /dev/null +++ b/cfp/cfparray2f.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array2f.h" +#include "zfp/array2.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array2f +#define CFP_REF_TYPE cfp_ref2f +#define CFP_PTR_TYPE cfp_ptr2f +#define CFP_ITER_TYPE cfp_iter2f +#define ZFP_ARRAY_TYPE zfp::array2f +#define ZFP_SCALAR_TYPE float + +#include "template/cfparray.cpp" +#include "template/cfparray2.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray3d.cpp b/cfp/cfparray3d.cpp new file mode 100644 index 00000000..100d639a --- /dev/null +++ b/cfp/cfparray3d.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array3d.h" +#include "zfp/array3.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array3d +#define CFP_REF_TYPE cfp_ref3d +#define CFP_PTR_TYPE cfp_ptr3d +#define CFP_ITER_TYPE cfp_iter3d +#define ZFP_ARRAY_TYPE zfp::array3d +#define ZFP_SCALAR_TYPE double + +#include "template/cfparray.cpp" +#include "template/cfparray3.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray3f.cpp b/cfp/cfparray3f.cpp new file mode 100644 index 00000000..b5cafb71 --- /dev/null +++ b/cfp/cfparray3f.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array3f.h" +#include "zfp/array3.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array3f +#define CFP_REF_TYPE cfp_ref3f +#define CFP_PTR_TYPE cfp_ptr3f +#define CFP_ITER_TYPE cfp_iter3f +#define ZFP_ARRAY_TYPE zfp::array3f +#define ZFP_SCALAR_TYPE float + +#include "template/cfparray.cpp" +#include "template/cfparray3.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray4d.cpp b/cfp/cfparray4d.cpp new file mode 100644 index 00000000..bf1a2b06 --- /dev/null +++ b/cfp/cfparray4d.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array4d.h" +#include "zfp/array4.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array4d +#define CFP_REF_TYPE cfp_ref4d +#define CFP_PTR_TYPE cfp_ptr4d +#define CFP_ITER_TYPE cfp_iter4d +#define ZFP_ARRAY_TYPE zfp::array4d +#define ZFP_SCALAR_TYPE double + +#include "template/cfparray.cpp" +#include "template/cfparray4.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfparray4f.cpp b/cfp/cfparray4f.cpp new file mode 100644 index 00000000..ca6bf0dd --- /dev/null +++ b/cfp/cfparray4f.cpp @@ -0,0 +1,21 @@ +#include "zfp/internal/cfp/array4f.h" +#include "zfp/array4.hpp" + +#include "template/template.h" + +#define CFP_ARRAY_TYPE cfp_array4f +#define CFP_REF_TYPE cfp_ref4f +#define CFP_PTR_TYPE cfp_ptr4f +#define CFP_ITER_TYPE cfp_iter4f +#define ZFP_ARRAY_TYPE zfp::array4f +#define ZFP_SCALAR_TYPE float + +#include "template/cfparray.cpp" +#include "template/cfparray4.cpp" + +#undef CFP_ARRAY_TYPE +#undef CFP_REF_TYPE +#undef CFP_PTR_TYPE +#undef CFP_ITER_TYPE +#undef ZFP_ARRAY_TYPE +#undef ZFP_SCALAR_TYPE diff --git a/cfp/cfpheader.cpp b/cfp/cfpheader.cpp new file mode 100644 index 00000000..b4b66e09 --- /dev/null +++ b/cfp/cfpheader.cpp @@ -0,0 +1,21 @@ +#include "zfp/array1.hpp" 
+#include "zfp/array2.hpp" +#include "zfp/array3.hpp" +#include "zfp/array4.hpp" +#include "zfp/internal/codec/zfpheader.hpp" +#include "zfp/internal/cfp/header.h" +#include "zfp/internal/cfp/array1f.h" +#include "zfp/internal/cfp/array1d.h" +#include "zfp/internal/cfp/array2f.h" +#include "zfp/internal/cfp/array2d.h" +#include "zfp/internal/cfp/array3f.h" +#include "zfp/internal/cfp/array3d.h" +#include "zfp/internal/cfp/array4f.h" +#include "zfp/internal/cfp/array4d.h" + +#include "template/template.h" + +#define CFP_HEADER_TYPE cfp_header +#define ZFP_HEADER_TYPE zfp::array::header + +#include "template/cfpheader.cpp" diff --git a/cfp/include/cfparray1d.h b/cfp/include/cfparray1d.h deleted file mode 100644 index 1be27295..00000000 --- a/cfp/include/cfparray1d.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef CFP_ARRAY_1D -#define CFP_ARRAY_1D - -#include -#include "zfp/types.h" - -struct cfp_array1d; -typedef struct cfp_array1d cfp_array1d; - -typedef struct { - cfp_array1d* (*ctor_default)(); - cfp_array1d* (*ctor)(uint n, double rate, const double* p, size_t csize); - cfp_array1d* (*ctor_copy)(const cfp_array1d* src); - void (*dtor)(cfp_array1d* self); - - void (*deep_copy)(cfp_array1d* self, const cfp_array1d* src); - - double (*rate)(const cfp_array1d* self); - double (*set_rate)(cfp_array1d* self, double rate); - size_t (*cache_size)(const cfp_array1d* self); - void (*set_cache_size)(cfp_array1d* self, size_t csize); - void (*clear_cache)(const cfp_array1d* self); - void (*flush_cache)(const cfp_array1d* self); - size_t (*compressed_size)(const cfp_array1d* self); - uchar* (*compressed_data)(const cfp_array1d* self); - size_t (*size)(const cfp_array1d* self); - void (*resize)(cfp_array1d* self, uint n, int clear); - - void (*get_array)(const cfp_array1d* self, double* p); - void (*set_array)(cfp_array1d* self, const double* p); - double (*get_flat)(const cfp_array1d* self, uint i); - void (*set_flat)(cfp_array1d* self, uint i, double val); - double (*get)(const cfp_array1d* self, uint i); - void (*set)(cfp_array1d* self, uint i, double val); -} cfp_array1d_api; - -#endif diff --git a/cfp/include/cfparray1f.h b/cfp/include/cfparray1f.h deleted file mode 100644 index 90d52391..00000000 --- a/cfp/include/cfparray1f.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef CFP_ARRAY_1F -#define CFP_ARRAY_1F - -#include -#include "zfp/types.h" - -struct cfp_array1f; -typedef struct cfp_array1f cfp_array1f; - -typedef struct { - cfp_array1f* (*ctor_default)(); - cfp_array1f* (*ctor)(uint n, double rate, const float* p, size_t csize); - cfp_array1f* (*ctor_copy)(const cfp_array1f* src); - void (*dtor)(cfp_array1f* self); - - void (*deep_copy)(cfp_array1f* self, const cfp_array1f* src); - - double (*rate)(const cfp_array1f* self); - double (*set_rate)(cfp_array1f* self, double rate); - size_t (*cache_size)(const cfp_array1f* self); - void (*set_cache_size)(cfp_array1f* self, size_t csize); - void (*clear_cache)(const cfp_array1f* self); - void (*flush_cache)(const cfp_array1f* self); - size_t (*compressed_size)(const cfp_array1f* self); - uchar* (*compressed_data)(const cfp_array1f* self); - size_t (*size)(const cfp_array1f* self); - void (*resize)(cfp_array1f* self, uint n, int clear); - - void (*get_array)(const cfp_array1f* self, float* p); - void (*set_array)(cfp_array1f* self, const float* p); - float (*get_flat)(const cfp_array1f* self, uint i); - void (*set_flat)(cfp_array1f* self, uint i, float val); - float (*get)(const cfp_array1f* self, uint i); - void (*set)(cfp_array1f* self, uint i, float val); -} 
cfp_array1f_api; - -#endif diff --git a/cfp/include/cfparray2d.h b/cfp/include/cfparray2d.h deleted file mode 100644 index b8d4c2a8..00000000 --- a/cfp/include/cfparray2d.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef CFP_ARRAY_2D -#define CFP_ARRAY_2D - -#include -#include "zfp/types.h" - -struct cfp_array2d; -typedef struct cfp_array2d cfp_array2d; - -typedef struct { - cfp_array2d* (*ctor_default)(); - cfp_array2d* (*ctor)(uint nx, uint ny, double rate, const double* p, size_t csize); - cfp_array2d* (*ctor_copy)(const cfp_array2d* src); - void (*dtor)(cfp_array2d* self); - - void (*deep_copy)(cfp_array2d* self, const cfp_array2d* src); - - double (*rate)(const cfp_array2d* self); - double (*set_rate)(cfp_array2d* self, double rate); - size_t (*cache_size)(const cfp_array2d* self); - void (*set_cache_size)(cfp_array2d* self, size_t csize); - void (*clear_cache)(const cfp_array2d* self); - void (*flush_cache)(const cfp_array2d* self); - size_t (*compressed_size)(const cfp_array2d* self); - uchar* (*compressed_data)(const cfp_array2d* self); - size_t (*size)(const cfp_array2d* self); - uint (*size_x)(const cfp_array2d* self); - uint (*size_y)(const cfp_array2d* self); - void (*resize)(cfp_array2d* self, uint nx, uint ny, int clear); - - void (*get_array)(const cfp_array2d* self, double* p); - void (*set_array)(cfp_array2d* self, const double* p); - double (*get_flat)(const cfp_array2d* self, uint i); - void (*set_flat)(cfp_array2d* self, uint i, double val); - double (*get)(const cfp_array2d* self, uint i, uint j); - void (*set)(cfp_array2d* self, uint i, uint j, double val); -} cfp_array2d_api; - -#endif diff --git a/cfp/include/cfparray2f.h b/cfp/include/cfparray2f.h deleted file mode 100644 index a531ac24..00000000 --- a/cfp/include/cfparray2f.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef CFP_ARRAY_2F -#define CFP_ARRAY_2F - -#include -#include "zfp/types.h" - -struct cfp_array2f; -typedef struct cfp_array2f cfp_array2f; - -typedef struct { - cfp_array2f* (*ctor_default)(); - cfp_array2f* (*ctor)(uint nx, uint ny, double rate, const float* p, size_t csize); - cfp_array2f* (*ctor_copy)(const cfp_array2f* src); - void (*dtor)(cfp_array2f* self); - - void (*deep_copy)(cfp_array2f* self, const cfp_array2f* src); - - double (*rate)(const cfp_array2f* self); - double (*set_rate)(cfp_array2f* self, double rate); - size_t (*cache_size)(const cfp_array2f* self); - void (*set_cache_size)(cfp_array2f* self, size_t csize); - void (*clear_cache)(const cfp_array2f* self); - void (*flush_cache)(const cfp_array2f* self); - size_t (*compressed_size)(const cfp_array2f* self); - uchar* (*compressed_data)(const cfp_array2f* self); - size_t (*size)(const cfp_array2f* self); - uint (*size_x)(const cfp_array2f* self); - uint (*size_y)(const cfp_array2f* self); - void (*resize)(cfp_array2f* self, uint nx, uint ny, int clear); - - void (*get_array)(const cfp_array2f* self, float* p); - void (*set_array)(cfp_array2f* self, const float* p); - float (*get_flat)(const cfp_array2f* self, uint i); - void (*set_flat)(cfp_array2f* self, uint i, float val); - float (*get)(const cfp_array2f* self, uint i, uint j); - void (*set)(cfp_array2f* self, uint i, uint j, float val); -} cfp_array2f_api; - -#endif diff --git a/cfp/include/cfparray3d.h b/cfp/include/cfparray3d.h deleted file mode 100644 index 8390a619..00000000 --- a/cfp/include/cfparray3d.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef CFP_ARRAY_3D -#define CFP_ARRAY_3D - -#include -#include "zfp/types.h" - -struct cfp_array3d; -typedef struct cfp_array3d cfp_array3d; - -typedef 
struct { - cfp_array3d* (*ctor_default)(); - cfp_array3d* (*ctor)(uint nx, uint ny, uint nz, double rate, const double* p, size_t csize); - cfp_array3d* (*ctor_copy)(const cfp_array3d* src); - void (*dtor)(cfp_array3d* self); - - void (*deep_copy)(cfp_array3d* self, const cfp_array3d* src); - - double (*rate)(const cfp_array3d* self); - double (*set_rate)(cfp_array3d* self, double rate); - size_t (*cache_size)(const cfp_array3d* self); - void (*set_cache_size)(cfp_array3d* self, size_t csize); - void (*clear_cache)(const cfp_array3d* self); - void (*flush_cache)(const cfp_array3d* self); - size_t (*compressed_size)(const cfp_array3d* self); - uchar* (*compressed_data)(const cfp_array3d* self); - size_t (*size)(const cfp_array3d* self); - uint (*size_x)(const cfp_array3d* self); - uint (*size_y)(const cfp_array3d* self); - uint (*size_z)(const cfp_array3d* self); - void (*resize)(cfp_array3d* self, uint nx, uint ny, uint nz, int clear); - - void (*get_array)(const cfp_array3d* self, double* p); - void (*set_array)(cfp_array3d* self, const double* p); - double (*get_flat)(const cfp_array3d* self, uint i); - void (*set_flat)(cfp_array3d* self, uint i, double val); - double (*get)(const cfp_array3d* self, uint i, uint j, uint k); - void (*set)(cfp_array3d* self, uint i, uint j, uint k, double val); -} cfp_array3d_api; - -#endif diff --git a/cfp/include/cfparray3f.h b/cfp/include/cfparray3f.h deleted file mode 100644 index 0261df31..00000000 --- a/cfp/include/cfparray3f.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef CFP_ARRAY_3F -#define CFP_ARRAY_3F - -#include -#include "zfp/types.h" - -struct cfp_array3f; -typedef struct cfp_array3f cfp_array3f; - -typedef struct { - cfp_array3f* (*ctor_default)(); - cfp_array3f* (*ctor)(uint nx, uint ny, uint nz, double rate, const float* p, size_t csize); - cfp_array3f* (*ctor_copy)(const cfp_array3f* src); - void (*dtor)(cfp_array3f* self); - - void (*deep_copy)(cfp_array3f* self, const cfp_array3f* src); - - double (*rate)(const cfp_array3f* self); - double (*set_rate)(cfp_array3f* self, double rate); - size_t (*cache_size)(const cfp_array3f* self); - void (*set_cache_size)(cfp_array3f* self, size_t csize); - void (*clear_cache)(const cfp_array3f* self); - void (*flush_cache)(const cfp_array3f* self); - size_t (*compressed_size)(const cfp_array3f* self); - uchar* (*compressed_data)(const cfp_array3f* self); - size_t (*size)(const cfp_array3f* self); - uint (*size_x)(const cfp_array3f* self); - uint (*size_y)(const cfp_array3f* self); - uint (*size_z)(const cfp_array3f* self); - void (*resize)(cfp_array3f* self, uint nx, uint ny, uint nz, int clear); - - void (*get_array)(const cfp_array3f* self, float* p); - void (*set_array)(cfp_array3f* self, const float* p); - float (*get_flat)(const cfp_array3f* self, uint i); - void (*set_flat)(cfp_array3f* self, uint i, float val); - float (*get)(const cfp_array3f* self, uint i, uint j, uint k); - void (*set)(cfp_array3f* self, uint i, uint j, uint k, float val); -} cfp_array3f_api; - -#endif diff --git a/cfp/include/cfparrays.h b/cfp/include/cfparrays.h deleted file mode 100644 index f716d828..00000000 --- a/cfp/include/cfparrays.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef CFP_ARRAYS -#define CFP_ARRAYS - -#include "cfparray1f.h" -#include "cfparray1d.h" -#include "cfparray2f.h" -#include "cfparray2d.h" -#include "cfparray3f.h" -#include "cfparray3d.h" - -#include "zfp/system.h" - -typedef struct { - cfp_array1f_api array1f; - cfp_array1d_api array1d; - cfp_array2f_api array2f; - cfp_array2d_api array2d; - 
cfp_array3f_api array3f; - cfp_array3d_api array3d; -} cfp_api; - -#ifndef CFP_NAMESPACE - #define CFP_NAMESPACE cfp -#endif - -extern_ const cfp_api CFP_NAMESPACE; - -#endif diff --git a/cfp/src/CMakeLists.txt b/cfp/src/CMakeLists.txt deleted file mode 100644 index 386698c6..00000000 --- a/cfp/src/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -add_library(cfp cfparrays.cpp) - -if(DEFINED CFP_NAMESPACE) - list(APPEND cfp_public_defs "CFP_NAMESPACE=${CFP_NAMESPACE}") -endif() - -list(APPEND cfp_private_defs ${zfp_compressed_array_defs}) - -if(WIN32) - # define ZFP_SOURCE when compiling libcfp to export symbols to Windows DLL - list(APPEND cfp_private_defs ZFP_SOURCE) -endif() - -target_compile_definitions(cfp - PUBLIC ${cfp_public_defs} - PRIVATE ${cfp_private_defs}) - -target_include_directories(cfp - PUBLIC - $ - $ - $ - $ - PRIVATE - ${ZFP_SOURCE_DIR}/array - ${ZFP_SOURCE_DIR}/src -) - -target_link_libraries(cfp zfp) - -set_property(TARGET cfp PROPERTY VERSION ${ZFP_VERSION}) -set_property(TARGET cfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR}) -set_property(TARGET cfp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}cfp) - -install(TARGETS cfp EXPORT cfp-targets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/cfp/src/cfparray1_source.cpp b/cfp/src/cfparray1_source.cpp deleted file mode 100644 index bdab414d..00000000 --- a/cfp/src/cfparray1_source.cpp +++ /dev/null @@ -1,23 +0,0 @@ -static CFP_ARRAY_TYPE * -_t1(CFP_ARRAY_TYPE, ctor)(uint n, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) -{ - return reinterpret_cast(new ZFP_ARRAY_TYPE(n, rate, p, csize)); -} - -static void -_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint n, int clear) -{ - reinterpret_cast(self)->resize(n, clear); -} - -static ZFP_SCALAR_TYPE -_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i) -{ - return reinterpret_cast(self)->operator()(i); -} - -static void -_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val) -{ - reinterpret_cast(self)->operator()(i) = val; -} diff --git a/cfp/src/cfparray1d.cpp b/cfp/src/cfparray1d.cpp deleted file mode 100644 index 1e71b0d2..00000000 --- a/cfp/src/cfparray1d.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray1d.h" -#include "zfparray1.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array1d -#define ZFP_ARRAY_TYPE zfp::array1d -#define ZFP_SCALAR_TYPE double - -#include "cfparray_source.cpp" -#include "cfparray1_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray1f.cpp b/cfp/src/cfparray1f.cpp deleted file mode 100644 index 56ecda58..00000000 --- a/cfp/src/cfparray1f.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray1f.h" -#include "zfparray1.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array1f -#define ZFP_ARRAY_TYPE zfp::array1f -#define ZFP_SCALAR_TYPE float - -#include "cfparray_source.cpp" -#include "cfparray1_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray2_source.cpp b/cfp/src/cfparray2_source.cpp deleted file mode 100644 index 6135ae40..00000000 --- a/cfp/src/cfparray2_source.cpp +++ /dev/null @@ -1,35 +0,0 @@ -static CFP_ARRAY_TYPE * -_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) -{ - return reinterpret_cast(new ZFP_ARRAY_TYPE(nx, ny, rate, p, csize)); -} - -static uint -_t1(CFP_ARRAY_TYPE, size_x)(const 
CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size_x(); -} - -static uint -_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size_y(); -} - -static void -_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, int clear) -{ - reinterpret_cast(self)->resize(nx, ny, clear); -} - -static ZFP_SCALAR_TYPE -_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j) -{ - return reinterpret_cast(self)->operator()(i, j); -} - -static void -_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, ZFP_SCALAR_TYPE val) -{ - reinterpret_cast(self)->operator()(i, j) = val; -} diff --git a/cfp/src/cfparray2d.cpp b/cfp/src/cfparray2d.cpp deleted file mode 100644 index 3debb2b8..00000000 --- a/cfp/src/cfparray2d.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray2d.h" -#include "zfparray2.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array2d -#define ZFP_ARRAY_TYPE zfp::array2d -#define ZFP_SCALAR_TYPE double - -#include "cfparray_source.cpp" -#include "cfparray2_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray2f.cpp b/cfp/src/cfparray2f.cpp deleted file mode 100644 index 37407cc8..00000000 --- a/cfp/src/cfparray2f.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray2f.h" -#include "zfparray2.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array2f -#define ZFP_ARRAY_TYPE zfp::array2f -#define ZFP_SCALAR_TYPE float - -#include "cfparray_source.cpp" -#include "cfparray2_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray3_source.cpp b/cfp/src/cfparray3_source.cpp deleted file mode 100644 index ae2ebf6d..00000000 --- a/cfp/src/cfparray3_source.cpp +++ /dev/null @@ -1,41 +0,0 @@ -static CFP_ARRAY_TYPE * -_t1(CFP_ARRAY_TYPE, ctor)(uint nx, uint ny, uint nz, double rate, const ZFP_SCALAR_TYPE * p, size_t csize) -{ - return reinterpret_cast(new ZFP_ARRAY_TYPE(nx, ny, nz, rate, p, csize)); -} - -static uint -_t1(CFP_ARRAY_TYPE, size_x)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size_x(); -} - -static uint -_t1(CFP_ARRAY_TYPE, size_y)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size_y(); -} - -static uint -_t1(CFP_ARRAY_TYPE, size_z)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size_z(); -} - -static void -_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE * self, uint nx, uint ny, uint nz, int clear) -{ - reinterpret_cast(self)->resize(nx, ny, nz, clear); -} - -static ZFP_SCALAR_TYPE -_t1(CFP_ARRAY_TYPE, get)(const CFP_ARRAY_TYPE * self, uint i, uint j, uint k) -{ - return reinterpret_cast(self)->operator()(i, j, k); -} - -static void -_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE * self, uint i, uint j, uint k, ZFP_SCALAR_TYPE val) -{ - reinterpret_cast(self)->operator()(i, j, k) = val; -} diff --git a/cfp/src/cfparray3d.cpp b/cfp/src/cfparray3d.cpp deleted file mode 100644 index fb5cc2e2..00000000 --- a/cfp/src/cfparray3d.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray3d.h" -#include "zfparray3.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array3d -#define ZFP_ARRAY_TYPE zfp::array3d -#define ZFP_SCALAR_TYPE double - -#include "cfparray_source.cpp" -#include "cfparray3_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray3f.cpp b/cfp/src/cfparray3f.cpp deleted file mode 100644 index 69331b1c..00000000 --- 
a/cfp/src/cfparray3f.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include "cfparray3f.h" -#include "zfparray3.h" - -#include "template/template.h" - -#define CFP_ARRAY_TYPE cfp_array3f -#define ZFP_ARRAY_TYPE zfp::array3f -#define ZFP_SCALAR_TYPE float - -#include "cfparray_source.cpp" -#include "cfparray3_source.cpp" - -#undef CFP_ARRAY_TYPE -#undef ZFP_ARRAY_TYPE -#undef ZFP_SCALAR_TYPE diff --git a/cfp/src/cfparray_source.cpp b/cfp/src/cfparray_source.cpp deleted file mode 100644 index d94e1a49..00000000 --- a/cfp/src/cfparray_source.cpp +++ /dev/null @@ -1,106 +0,0 @@ -// common constructor, destructor -static CFP_ARRAY_TYPE * -_t1(CFP_ARRAY_TYPE, ctor_default)() -{ - return reinterpret_cast(new ZFP_ARRAY_TYPE()); -} - -static CFP_ARRAY_TYPE * -_t1(CFP_ARRAY_TYPE, ctor_copy)(const CFP_ARRAY_TYPE * src) -{ - return reinterpret_cast( - new ZFP_ARRAY_TYPE(*reinterpret_cast(src)) - ); -} - -static void -_t1(CFP_ARRAY_TYPE, dtor)(CFP_ARRAY_TYPE * self) -{ - delete reinterpret_cast(self); -} - -// functions defined in zfparray.h (base class) -static double -_t1(CFP_ARRAY_TYPE, rate)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->rate(); -} - -static double -_t1(CFP_ARRAY_TYPE, set_rate)(CFP_ARRAY_TYPE * self, double rate) -{ - return reinterpret_cast(self)->set_rate(rate); -} - -static size_t -_t1(CFP_ARRAY_TYPE, compressed_size)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->compressed_size(); -} - -static uchar* -_t1(CFP_ARRAY_TYPE, compressed_data)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->compressed_data(); -} - -static void -_t1(CFP_ARRAY_TYPE, deep_copy)(CFP_ARRAY_TYPE * self, const CFP_ARRAY_TYPE * src) -{ - *reinterpret_cast(self) = *reinterpret_cast(src); -} - -// functions defined in subclasses -static size_t -_t1(CFP_ARRAY_TYPE, size)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->size(); -} - -static size_t -_t1(CFP_ARRAY_TYPE, cache_size)(const CFP_ARRAY_TYPE * self) -{ - return reinterpret_cast(self)->cache_size(); -} - -static void -_t1(CFP_ARRAY_TYPE, set_cache_size)(CFP_ARRAY_TYPE * self, size_t csize) -{ - reinterpret_cast(self)->set_cache_size(csize); -} - -static void -_t1(CFP_ARRAY_TYPE, clear_cache)(const CFP_ARRAY_TYPE * self) -{ - reinterpret_cast(self)->clear_cache(); -} - -static void -_t1(CFP_ARRAY_TYPE, flush_cache)(const CFP_ARRAY_TYPE * self) -{ - reinterpret_cast(self)->flush_cache(); -} - -static void -_t1(CFP_ARRAY_TYPE, get_array)(const CFP_ARRAY_TYPE * self, ZFP_SCALAR_TYPE * p) -{ - reinterpret_cast(self)->get(p); -} - -static void -_t1(CFP_ARRAY_TYPE, set_array)(CFP_ARRAY_TYPE * self, const ZFP_SCALAR_TYPE * p) -{ - reinterpret_cast(self)->set(p); -} - -static ZFP_SCALAR_TYPE -_t1(CFP_ARRAY_TYPE, get_flat)(const CFP_ARRAY_TYPE * self, uint i) -{ - return reinterpret_cast(self)->operator[](i); -} - -static void -_t1(CFP_ARRAY_TYPE, set_flat)(CFP_ARRAY_TYPE * self, uint i, ZFP_SCALAR_TYPE val) -{ - reinterpret_cast(self)->operator[](i) = val; -} diff --git a/cfp/src/cfparrays.cpp b/cfp/src/cfparrays.cpp deleted file mode 100644 index bcd88686..00000000 --- a/cfp/src/cfparrays.cpp +++ /dev/null @@ -1,183 +0,0 @@ -#include "cfparrays.h" - -#include "cfparray1f.cpp" -#include "cfparray1d.cpp" -#include "cfparray2f.cpp" -#include "cfparray2d.cpp" -#include "cfparray3f.cpp" -#include "cfparray3d.cpp" - -export_ const cfp_api CFP_NAMESPACE = { - // array1f - { - cfp_array1f_ctor_default, - cfp_array1f_ctor, - cfp_array1f_ctor_copy, - cfp_array1f_dtor, - - cfp_array1f_deep_copy, - - 
cfp_array1f_rate, - cfp_array1f_set_rate, - cfp_array1f_cache_size, - cfp_array1f_set_cache_size, - cfp_array1f_clear_cache, - cfp_array1f_flush_cache, - cfp_array1f_compressed_size, - cfp_array1f_compressed_data, - cfp_array1f_size, - cfp_array1f_resize, - - cfp_array1f_get_array, - cfp_array1f_set_array, - cfp_array1f_get_flat, - cfp_array1f_set_flat, - cfp_array1f_get, - cfp_array1f_set, - }, - // array1d - { - cfp_array1d_ctor_default, - cfp_array1d_ctor, - cfp_array1d_ctor_copy, - cfp_array1d_dtor, - - cfp_array1d_deep_copy, - - cfp_array1d_rate, - cfp_array1d_set_rate, - cfp_array1d_cache_size, - cfp_array1d_set_cache_size, - cfp_array1d_clear_cache, - cfp_array1d_flush_cache, - cfp_array1d_compressed_size, - cfp_array1d_compressed_data, - cfp_array1d_size, - cfp_array1d_resize, - - cfp_array1d_get_array, - cfp_array1d_set_array, - cfp_array1d_get_flat, - cfp_array1d_set_flat, - cfp_array1d_get, - cfp_array1d_set, - }, - // array2f - { - cfp_array2f_ctor_default, - cfp_array2f_ctor, - cfp_array2f_ctor_copy, - cfp_array2f_dtor, - - cfp_array2f_deep_copy, - - cfp_array2f_rate, - cfp_array2f_set_rate, - cfp_array2f_cache_size, - cfp_array2f_set_cache_size, - cfp_array2f_clear_cache, - cfp_array2f_flush_cache, - cfp_array2f_compressed_size, - cfp_array2f_compressed_data, - cfp_array2f_size, - cfp_array2f_size_x, - cfp_array2f_size_y, - cfp_array2f_resize, - - cfp_array2f_get_array, - cfp_array2f_set_array, - cfp_array2f_get_flat, - cfp_array2f_set_flat, - cfp_array2f_get, - cfp_array2f_set, - }, - // array2d - { - cfp_array2d_ctor_default, - cfp_array2d_ctor, - cfp_array2d_ctor_copy, - cfp_array2d_dtor, - - cfp_array2d_deep_copy, - - cfp_array2d_rate, - cfp_array2d_set_rate, - cfp_array2d_cache_size, - cfp_array2d_set_cache_size, - cfp_array2d_clear_cache, - cfp_array2d_flush_cache, - cfp_array2d_compressed_size, - cfp_array2d_compressed_data, - cfp_array2d_size, - cfp_array2d_size_x, - cfp_array2d_size_y, - cfp_array2d_resize, - - cfp_array2d_get_array, - cfp_array2d_set_array, - cfp_array2d_get_flat, - cfp_array2d_set_flat, - cfp_array2d_get, - cfp_array2d_set, - }, - // array3f - { - cfp_array3f_ctor_default, - cfp_array3f_ctor, - cfp_array3f_ctor_copy, - cfp_array3f_dtor, - - cfp_array3f_deep_copy, - - cfp_array3f_rate, - cfp_array3f_set_rate, - cfp_array3f_cache_size, - cfp_array3f_set_cache_size, - cfp_array3f_clear_cache, - cfp_array3f_flush_cache, - cfp_array3f_compressed_size, - cfp_array3f_compressed_data, - cfp_array3f_size, - cfp_array3f_size_x, - cfp_array3f_size_y, - cfp_array3f_size_z, - cfp_array3f_resize, - - cfp_array3f_get_array, - cfp_array3f_set_array, - cfp_array3f_get_flat, - cfp_array3f_set_flat, - cfp_array3f_get, - cfp_array3f_set, - }, - // array3d - { - cfp_array3d_ctor_default, - cfp_array3d_ctor, - cfp_array3d_ctor_copy, - cfp_array3d_dtor, - - cfp_array3d_deep_copy, - - cfp_array3d_rate, - cfp_array3d_set_rate, - cfp_array3d_cache_size, - cfp_array3d_set_cache_size, - cfp_array3d_clear_cache, - cfp_array3d_flush_cache, - cfp_array3d_compressed_size, - cfp_array3d_compressed_data, - cfp_array3d_size, - cfp_array3d_size_x, - cfp_array3d_size_y, - cfp_array3d_size_z, - cfp_array3d_resize, - - cfp_array3d_get_array, - cfp_array3d_set_array, - cfp_array3d_get_flat, - cfp_array3d_set_flat, - cfp_array3d_get, - cfp_array3d_set, - }, -}; diff --git a/cfp/template/cfparray.cpp b/cfp/template/cfparray.cpp new file mode 100644 index 00000000..70bb3c2d --- /dev/null +++ b/cfp/template/cfparray.cpp @@ -0,0 +1,136 @@ +// common constructor, destructor +static 
CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor_default)() +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(); + return a; +} + +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor_copy)(CFP_ARRAY_TYPE src) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(*static_cast(src.object)); + return a; +} + +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor_header)(CFP_HEADER_TYPE h, const void* buffer, size_t buffer_size_bytes) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(*static_cast(h.object), buffer, buffer_size_bytes); + return a; +} + +static void +_t1(CFP_ARRAY_TYPE, dtor)(CFP_ARRAY_TYPE self) +{ + delete static_cast(self.object); +} + +// functions defined in zfparray.h (base class) +static double +_t1(CFP_ARRAY_TYPE, rate)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->rate(); +} + +static double +_t1(CFP_ARRAY_TYPE, set_rate)(CFP_ARRAY_TYPE self, double rate) +{ + return static_cast(self.object)->set_rate(rate); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_bytes)(CFP_ARRAY_TYPE self, uint mask) +{ + return static_cast(self.object)->size_bytes(mask); +} + +static size_t +_t1(CFP_ARRAY_TYPE, compressed_size)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->compressed_size(); +} + +static void* +_t1(CFP_ARRAY_TYPE, compressed_data)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->compressed_data(); +} + +static void +_t1(CFP_ARRAY_TYPE, deep_copy)(CFP_ARRAY_TYPE self, const CFP_ARRAY_TYPE src) +{ + *static_cast(self.object) = *static_cast(src.object); +} + +// functions defined in subclasses +static size_t +_t1(CFP_ARRAY_TYPE, size)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, cache_size)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->cache_size(); +} + +static void +_t1(CFP_ARRAY_TYPE, set_cache_size)(CFP_ARRAY_TYPE self, size_t bytes) +{ + static_cast(self.object)->set_cache_size(bytes); +} + +static void +_t1(CFP_ARRAY_TYPE, clear_cache)(CFP_ARRAY_TYPE self) +{ + static_cast(self.object)->clear_cache(); +} + +static void +_t1(CFP_ARRAY_TYPE, flush_cache)(CFP_ARRAY_TYPE self) +{ + static_cast(self.object)->flush_cache(); +} + +static void +_t1(CFP_ARRAY_TYPE, get_array)(CFP_ARRAY_TYPE self, ZFP_SCALAR_TYPE * p) +{ + static_cast(self.object)->get(p); +} + +static void +_t1(CFP_ARRAY_TYPE, set_array)(CFP_ARRAY_TYPE self, const ZFP_SCALAR_TYPE * p) +{ + static_cast(self.object)->set(p); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + return static_cast(self.object)->operator[](i); +} + +static void +_t1(CFP_ARRAY_TYPE, set_flat)(CFP_ARRAY_TYPE self, size_t i, ZFP_SCALAR_TYPE val) +{ + static_cast(self.object)->operator[](i) = val; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, ptr)(CFP_REF_TYPE self) +{ + CFP_PTR_TYPE p; + p.reference = self; + return p; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, ref)(CFP_PTR_TYPE self) +{ + return self.reference; +} diff --git a/cfp/template/cfparray1.cpp b/cfp/template/cfparray1.cpp new file mode 100644 index 00000000..6ebc19ce --- /dev/null +++ b/cfp/template/cfparray1.cpp @@ -0,0 +1,332 @@ +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor)(size_t n, double rate, const ZFP_SCALAR_TYPE* p, size_t cache_size) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(n, rate, p, cache_size); + return a; +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE self, size_t n, zfp_bool clear) +{ + static_cast(self.object)->resize(n, !!clear); +} + +static 
ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(CFP_ARRAY_TYPE self, size_t i) +{ + return static_cast(self.object)->operator()(i); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE self, size_t i, ZFP_SCALAR_TYPE val) +{ + static_cast(self.object)->operator()(i) = val; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_REF_TYPE r; + r.array = self; + r.x = i; + return r; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_REF_TYPE r; + r.array = self; + r.x = i; + return r; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref)(self, i); + return p; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref_flat)(self, i); + return p; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, begin)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + return it; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, end)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = static_cast(self.object)->size_x(); + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, get)(CFP_REF_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, set)(CFP_REF_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, copy)(CFP_REF_TYPE self, CFP_REF_TYPE src) +{ + static_cast(self.array.object)->operator()(self.x) = + static_cast(src.array.object)->operator()(src.x); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, lt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x < rhs.reference.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, gt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x > rhs.reference.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, leq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x <= rhs.reference.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, geq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x >= rhs.reference.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x == rhs.reference.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, neq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, distance)(CFP_PTR_TYPE first, CFP_PTR_TYPE last) +{ + return last.reference.x - first.reference.x; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + p.reference.x += d; + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, prev)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + p.reference.x -= d; + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, inc)(CFP_PTR_TYPE p) +{ + p.reference.x++; + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, 
dec)(CFP_PTR_TYPE p) +{ + p.reference.x--; + return p; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get)(CFP_PTR_TYPE self) +{ + return static_cast(self.reference.array.object)->operator()(self.reference.x); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return static_cast(self.reference.array.object)->operator()(self.reference.x); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set)(CFP_PTR_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.reference.array.object)->operator()(self.reference.x) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set_at)(CFP_PTR_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + static_cast(self.reference.array.object)->operator()(self.reference.x) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, ref_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return self.reference; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, lt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && lhs.x < rhs.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, gt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && lhs.x > rhs.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, leq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && lhs.x <= rhs.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, geq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && lhs.x >= rhs.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && lhs.x == rhs.x; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, neq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, distance)(CFP_ITER_TYPE first, CFP_ITER_TYPE last) +{ + return last.x - first.x; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + it.x += d; + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, prev)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + it.x -= d; + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, inc)(CFP_ITER_TYPE it) +{ + it.x++; + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, dec)(CFP_ITER_TYPE it) +{ + it.x--; + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get)(CFP_ITER_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return static_cast(self.array.object)->operator()(self.x); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set)(CFP_ITER_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set_at)(CFP_ITER_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + static_cast(self.array.object)->operator()(self.x) = val; +} + +static CFP_REF_TYPE 
+_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x); +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x); +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, i)(CFP_ITER_TYPE self) +{ + return self.x; +} diff --git a/cfp/template/cfparray2.cpp b/cfp/template/cfparray2.cpp new file mode 100644 index 00000000..f919ff09 --- /dev/null +++ b/cfp/template/cfparray2.cpp @@ -0,0 +1,468 @@ +// utility function: compute onedimensional offset from multidimensional index +static ptrdiff_t +ref_offset(const CFP_REF_TYPE& self) +{ + size_t nx = static_cast(self.array.object)->size_x(); + return static_cast(self.x + nx * self.y); +} + +// utility function: compute multidimensional index from onedimensional offset +static void +ref_set_offset(CFP_REF_TYPE& self, size_t offset) +{ + size_t nx = static_cast(self.array.object)->size_x(); + self.x = offset % nx; offset /= nx; + self.y = offset; +} + +// utility function: compute onedimensional offset from multidimensional index +static ptrdiff_t +iter_offset(const CFP_ITER_TYPE& self) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t x = self.x; + size_t y = self.y; + size_t p = 0; + if (y == ymax) + p += nx * ny; + else { + size_t m = ~size_t(3); + size_t by = std::max(y & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy; + p += (y - by) * sx; + p += (x - bx); + } + return static_cast(p); +} + +// utility function: compute multidimensional index from onedimensional offset +static void +iter_set_offset(CFP_ITER_TYPE& self, size_t offset) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t p = offset; + size_t x, y; + if (p == nx * ny) { + x = xmin; + y = ymax; + } + else { + size_t m = ~size_t(3); + size_t by = std::max((ymin + p / nx) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx; + size_t bx = std::max((xmin + p / sy) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy; + y = by + p / sx; p -= (y - by) * sx; + x = bx + p; p -= (x - bx); + } + self.x = x; + self.y = y; +} + +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor)(size_t nx, size_t ny, double rate, const ZFP_SCALAR_TYPE* p, size_t cache_size) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(nx, ny, rate, p, cache_size); + return a; +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_x)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_x(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_y)(CFP_ARRAY_TYPE self) +{ + 
return static_cast(self.object)->size_y(); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE self, size_t nx, size_t ny, zfp_bool clear) +{ + static_cast(self.object)->resize(nx, ny, !!clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(CFP_ARRAY_TYPE self, size_t i, size_t j) +{ + return static_cast(self.object)->operator()(i, j); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE self, size_t i, size_t j, ZFP_SCALAR_TYPE val) +{ + static_cast(self.object)->operator()(i, j) = val; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref)(CFP_ARRAY_TYPE self, size_t i, size_t j) +{ + CFP_REF_TYPE r; + r.array = self; + r.x = i; + r.y = j; + return r; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_REF_TYPE r; + r.array = self; + ref_set_offset(r, i); + return r; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr)(CFP_ARRAY_TYPE self, size_t i, size_t j) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref)(self, i, j); + return p; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref_flat)(self, i); + return p; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, begin)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = 0; + return it; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, end)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = static_cast(self.object)->size_y(); + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, get)(CFP_REF_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x, self.y); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, set)(CFP_REF_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, copy)(CFP_REF_TYPE self, CFP_REF_TYPE src) +{ + static_cast(self.array.object)->operator()(self.x, self.y) = + static_cast(src.array.object)->operator()(src.x, src.y); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, lt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) < ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, gt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) > ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, leq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) <= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, geq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) >= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x == rhs.reference.x && + lhs.reference.y == rhs.reference.y; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, neq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, distance)(CFP_PTR_TYPE first, CFP_PTR_TYPE last) +{ + return ref_offset(last.reference) - 
ref_offset(first.reference); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + ref_set_offset(p.reference, ref_offset(p.reference) + d); + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, prev)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, -d); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, inc)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, +1); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, dec)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, -1); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get)(CFP_PTR_TYPE self) +{ + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set)(CFP_PTR_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set_at)(CFP_PTR_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, ref_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return self.reference; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, lt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) < iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, gt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) > iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, leq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) <= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, geq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) >= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && + lhs.x == rhs.x && + lhs.y == rhs.y; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, neq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, distance)(CFP_ITER_TYPE first, CFP_ITER_TYPE last) +{ + return iter_offset(last) - iter_offset(first); +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) + d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, prev)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) - d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, inc)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = 
container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t m = ~size_t(3); + ++it.x; + if (!(it.x & 3u) || it.x == xmax) { + it.x = std::max((it.x - 1) & m, xmin); + ++it.y; + if (!(it.y & 3u) || it.y == ymax) { + it.y = std::max((it.y - 1) & m, ymin); + // done with block; advance to next + it.x = (it.x + 4) & m; + if (it.x >= xmax) { + it.x = xmin; + it.y = (it.y + 4) & m; + if (it.y >= ymax) + it.y = ymax; + } + } + } + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, dec)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t m = ~size_t(3); + if (it.y == ymax) { + it.x = xmax - 1; + it.y = ymax - 1; + } + else { + if (!(it.x & 3u) || it.x == xmin) { + it.x = std::min((it.x + 4) & m, xmax); + if (!(it.y & 3u) || it.y == ymin) { + it.y = std::min((it.y + 4) & m, ymax); + // done with block; advance to next + it.x = (it.x - 1) & m; + if (it.x <= xmin) { + it.x = xmax; + it.y = (it.y - 1) & m; + if (it.y <= ymin) + it.y = ymin; + } + } + --it.y; + } + --it.x; + } + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get)(CFP_ITER_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x, self.y); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return static_cast(self.array.object)->operator()(self.x, self.y); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set)(CFP_ITER_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set_at)(CFP_ITER_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + static_cast(self.array.object)->operator()(self.x, self.y) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y); +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y); +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, i)(CFP_ITER_TYPE self) +{ + return self.x; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, j)(CFP_ITER_TYPE self) +{ + return self.y; +} diff --git a/cfp/template/cfparray3.cpp b/cfp/template/cfparray3.cpp new file mode 100644 index 00000000..efbdc3ce --- /dev/null +++ b/cfp/template/cfparray3.cpp @@ -0,0 +1,522 @@ +// utility function: compute onedimensional offset from multidimensional index +static ptrdiff_t +ref_offset(const CFP_REF_TYPE& self) +{ + size_t nx = static_cast(self.array.object)->size_x(); + size_t ny = static_cast(self.array.object)->size_y(); + return static_cast(self.x + nx * (self.y + ny * self.z)); +} + +// utility function: compute multidimensional index from onedimensional 
offset +static void +ref_set_offset(CFP_REF_TYPE& self, size_t offset) +{ + size_t nx = static_cast(self.array.object)->size_x(); + size_t ny = static_cast(self.array.object)->size_y(); + self.x = offset % nx; offset /= nx; + self.y = offset % ny; offset /= ny; + self.z = offset; +} + +// utility function: compute onedimensional offset from multidimensional index +static ptrdiff_t +iter_offset(const CFP_ITER_TYPE& self) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t x = self.x; + size_t y = self.y; + size_t z = self.z; + size_t p = 0; + if (z == zmax) + p += nx * ny * nz; + else { + size_t m = ~size_t(3); + size_t bz = std::max(z & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p += (bz - zmin) * nx * ny; + size_t by = std::max(y & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx * sz; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy * sz; + p += (z - bz) * sx * sy; + p += (y - by) * sx; + p += (x - bx); + } + return p; +} + +// utility function: compute multidimensional index from onedimensional offset +static void +iter_set_offset(CFP_ITER_TYPE& self, size_t offset) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t p = offset; + size_t x, y, z; + if (p == nx * ny * nz) { + x = xmin; + y = ymin; + z = zmax; + } + else { + size_t m = ~size_t(3); + size_t bz = std::max((zmin + p / (nx * ny)) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny; + size_t by = std::max((ymin + p / (nx * sz)) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz; + size_t bx = std::max((xmin + p / (sy * sz)) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz; + z = bz + p / (sx * sy); p -= (z - bz) * sx * sy; + y = by + p / sx; p -= (y - by) * sx; + x = bx + p; p -= (x - bx); + } + self.x = x; + self.y = y; + self.z = z; +} + +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor)(size_t nx, size_t ny, size_t nz, double rate, const ZFP_SCALAR_TYPE* p, size_t cache_size) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(nx, ny, nz, rate, p, cache_size); + return a; +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_x)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_x(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_y)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_y(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_z)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_z(); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE self, size_t nx, size_t ny, size_t nz, zfp_bool clear) +{ + static_cast(self.object)->resize(nx, ny, nz, !!clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k) +{ + return static_cast(self.object)->operator()(i, j, k); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k, ZFP_SCALAR_TYPE val) +{ + 
static_cast(self.object)->operator()(i, j, k) = val; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k) +{ + CFP_REF_TYPE r; + r.array = self; + r.x = i; + r.y = j; + r.z = k; + return r; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_REF_TYPE r; + r.array = self; + ref_set_offset(r, i); + return r; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref)(self, i, j, k); + return p; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref_flat)(self, i); + return p; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, begin)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = 0; + it.z = 0; + return it; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, end)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = 0; + it.z = static_cast(self.object)->size_z(); + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, get)(CFP_REF_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x, self.y, self.z); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, set)(CFP_REF_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, copy)(CFP_REF_TYPE self, CFP_REF_TYPE src) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z) = + static_cast(src.array.object)->operator()(src.x, src.y, src.z); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, lt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) < ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, gt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) > ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, leq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) <= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, geq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) >= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x == rhs.reference.x && + lhs.reference.y == rhs.reference.y && + lhs.reference.z == rhs.reference.z; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, neq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, distance)(CFP_PTR_TYPE first, CFP_PTR_TYPE last) +{ + return ref_offset(last.reference) - ref_offset(first.reference); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + ref_set_offset(p.reference, ref_offset(p.reference) + d); + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, prev)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + return _t2(CFP_ARRAY_TYPE, 
CFP_PTR_TYPE, next)(p, -d); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, inc)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, +1); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, dec)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, -1); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get)(CFP_PTR_TYPE self) +{ + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set)(CFP_PTR_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set_at)(CFP_PTR_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, ref_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return self.reference; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, lt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) < iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, gt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) > iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, leq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) <= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, geq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) >= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && + lhs.x == rhs.x && + lhs.y == rhs.y && + lhs.z == rhs.z; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, neq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, distance)(CFP_ITER_TYPE first, CFP_ITER_TYPE last) +{ + return iter_offset(last) - iter_offset(first); +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) + d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, prev)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) - d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, inc)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t m = ~size_t(3); + ++it.x; + if (!(it.x & 3u) || it.x == xmax) { + it.x = std::max((it.x - 1) & m, xmin); + 
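+    // x wrapped past the end of the current 4-wide block (or the array
+    // edge): rewind x to the block's first column and carry into y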
++it.y; + if (!(it.y & 3u) || it.y == ymax) { + it.y = std::max((it.y - 1) & m, ymin); + ++it.z; + if (!(it.z & 3u) || it.z == zmax) { + it.z = std::max((it.z - 1) & m, zmin); + // done with block; advance to next + it.x = (it.x + 4) & m; + if (it.x >= xmax) { + it.x = xmin; + it.y = (it.y + 4) & m; + if (it.y >= ymax) { + it.y = ymin; + it.z = (it.z + 4) & m; + if (it.z >= zmax) + it.z = zmax; + } + } + } + } + } + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, dec)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t m = ~size_t(3); + if (it.z == zmax) { + it.x = xmax - 1; + it.y = ymax - 1; + it.z = zmax - 1; + } + else { + if (!(it.x & 3u) || it.x == xmin) { + it.x = std::min((it.x + 4) & m, xmax); + if (!(it.y & 3u) || it.y == ymin) { + it.y = std::min((it.y + 4) & m, ymax); + if (!(it.z & 3u) || it.z == zmin) { + it.z = std::min((it.z + 4) & m, zmax); + // done with block; advance to next + it.x = (it.x - 1) & m; + if (it.x <= xmin) { + it.x = xmax; + it.y = (it.y - 1) & m; + if (it.y <= ymin) { + it.y = ymax; + it.z = (it.z - 1) & m; + if (it.z <= zmin) + it.z = zmin; + } + } + } + --it.z; + } + --it.y; + } + --it.x; + } + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get)(CFP_ITER_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x, self.y, self.z); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return static_cast(self.array.object)->operator()(self.x, self.y, self.z); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set)(CFP_ITER_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set_at)(CFP_ITER_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + static_cast(self.array.object)->operator()(self.x, self.y, self.z) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y, self.z); +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y, self.z); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y, self.z); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y, self.z); +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, i)(CFP_ITER_TYPE self) +{ + return self.x; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, j)(CFP_ITER_TYPE self) +{ + return self.y; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, k)(CFP_ITER_TYPE self) +{ + return self.z; +} diff --git a/cfp/template/cfparray4.cpp b/cfp/template/cfparray4.cpp new file mode 100644 index 00000000..d55ef47f --- /dev/null +++ b/cfp/template/cfparray4.cpp @@ -0,0 +1,576 @@ +// utility function: compute 
onedimensional offset from multidimensional index +static ptrdiff_t +ref_offset(const CFP_REF_TYPE& self) +{ + size_t nx = static_cast(self.array.object)->size_x(); + size_t ny = static_cast(self.array.object)->size_y(); + size_t nz = static_cast(self.array.object)->size_z(); + return static_cast(self.x + nx * (self.y + ny * (self.z + nz * self.w))); +} + +// utility function: compute multidimensional index from onedimensional offset +static void +ref_set_offset(CFP_REF_TYPE& self, size_t offset) +{ + size_t nx = static_cast(self.array.object)->size_x(); + size_t ny = static_cast(self.array.object)->size_y(); + size_t nz = static_cast(self.array.object)->size_z(); + self.x = offset % nx; offset /= nx; + self.y = offset % ny; offset /= ny; + self.z = offset % nz; offset /= nz; + self.w = offset; +} + +// utility function: compute onedimensional offset from multidimensional index +static ptrdiff_t +iter_offset(const CFP_ITER_TYPE& self) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t wmin = 0; + size_t wmax = container->size_w(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t nw = wmax - wmin; + size_t x = self.x; + size_t y = self.y; + size_t z = self.z; + size_t w = self.w; + size_t p = 0; + if (w == wmax) + p += nx * ny * nz * nw; + else { + size_t m = ~size_t(3); + size_t bw = std::max(w & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p += (bw - wmin) * nx * ny * nz; + size_t bz = std::max(z & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p += (bz - zmin) * nx * ny * sw; + size_t by = std::max(y & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx * sz * sw; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy * sz * sw; + p += (w - bw) * sx * sy * sz; + p += (z - bz) * sx * sy; + p += (y - by) * sx; + p += (x - bx); + } + return static_cast(p); +} + +// utility function: compute multidimensional index from onedimensional offset +static void +iter_set_offset(CFP_ITER_TYPE& self, size_t offset) +{ + const ZFP_ARRAY_TYPE* container = static_cast(self.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t wmin = 0; + size_t wmax = container->size_w(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t nw = wmax - wmin; + size_t p = offset; + size_t x, y, z, w; + if (p == nx * ny * nz * nw) { + x = xmin; + y = ymin; + z = zmin; + w = wmax; + } + else { + size_t m = ~size_t(3); + size_t bw = std::max((wmin + p / (nx * ny * nz)) & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p -= (bw - wmin) * nx * ny * nz; + size_t bz = std::max((zmin + p / (nx * ny * sw)) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny * sw; + size_t by = std::max((ymin + p / (nx * sz * sw)) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz * sw; + size_t bx = std::max((xmin + p / (sy * sz * sw)) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz * sw; + w = bw + p / (sx * sy * sz); p -= (w - bw) * sx * sy * sz; + z = bz + p / (sx * sy); p -= (z - bz) * sx * sy; + y = by + p / sx; p -= (y - by) 
* sx; + x = bx + p; p -= (x - bx); + } + self.x = x; + self.y = y; + self.z = z; + self.w = w; +} + +static CFP_ARRAY_TYPE +_t1(CFP_ARRAY_TYPE, ctor)(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const ZFP_SCALAR_TYPE* p, size_t cache_size) +{ + CFP_ARRAY_TYPE a; + a.object = new ZFP_ARRAY_TYPE(nx, ny, nz, nw, rate, p, cache_size); + return a; +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_x)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_x(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_y)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_y(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_z)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_z(); +} + +static size_t +_t1(CFP_ARRAY_TYPE, size_w)(CFP_ARRAY_TYPE self) +{ + return static_cast(self.object)->size_w(); +} + +static void +_t1(CFP_ARRAY_TYPE, resize)(CFP_ARRAY_TYPE self, size_t nx, size_t ny, size_t nz, size_t nw, zfp_bool clear) +{ + static_cast(self.object)->resize(nx, ny, nz, nw, !!clear); +} + +static ZFP_SCALAR_TYPE +_t1(CFP_ARRAY_TYPE, get)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k, size_t l) +{ + return static_cast(self.object)->operator()(i, j, k, l); +} + +static void +_t1(CFP_ARRAY_TYPE, set)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k, size_t l, ZFP_SCALAR_TYPE val) +{ + static_cast(self.object)->operator()(i, j, k, l) = val; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k, size_t l) +{ + CFP_REF_TYPE r; + r.array = self; + r.x = i; + r.y = j; + r.z = k; + r.w = l; + return r; +} + +static CFP_REF_TYPE +_t1(CFP_ARRAY_TYPE, ref_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_REF_TYPE r; + r.array = self; + ref_set_offset(r, i); + return r; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr)(CFP_ARRAY_TYPE self, size_t i, size_t j, size_t k, size_t l) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref)(self, i, j, k, l); + return p; +} + +static CFP_PTR_TYPE +_t1(CFP_ARRAY_TYPE, ptr_flat)(CFP_ARRAY_TYPE self, size_t i) +{ + CFP_PTR_TYPE p; + p.reference = _t1(CFP_ARRAY_TYPE, ref_flat)(self, i); + return p; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, begin)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = 0; + it.z = 0; + it.w = 0; + return it; +} + +static CFP_ITER_TYPE +_t1(CFP_ARRAY_TYPE, end)(CFP_ARRAY_TYPE self) +{ + CFP_ITER_TYPE it; + it.array = self; + it.x = 0; + it.y = 0; + it.z = 0; + it.w = static_cast(self.object)->size_w(); + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, get)(CFP_REF_TYPE self) +{ + return static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, set)(CFP_REF_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_REF_TYPE, copy)(CFP_REF_TYPE self, CFP_REF_TYPE src) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w) = + static_cast(src.array.object)->operator()(src.x, src.y, src.z, src.w); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, lt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) < ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, gt)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && 
ref_offset(lhs.reference) > ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, leq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) <= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, geq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && ref_offset(lhs.reference) >= ref_offset(rhs.reference); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return lhs.reference.array.object == rhs.reference.array.object && + lhs.reference.x == rhs.reference.x && + lhs.reference.y == rhs.reference.y && + lhs.reference.z == rhs.reference.z && + lhs.reference.w == rhs.reference.w; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, neq)(CFP_PTR_TYPE lhs, CFP_PTR_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, distance)(CFP_PTR_TYPE first, CFP_PTR_TYPE last) +{ + return ref_offset(last.reference) - ref_offset(first.reference); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + ref_set_offset(p.reference, ref_offset(p.reference) + d); + return p; +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, prev)(CFP_PTR_TYPE p, ptrdiff_t d) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, -d); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, inc)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, +1); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, dec)(CFP_PTR_TYPE p) +{ + return _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(p, -1); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get)(CFP_PTR_TYPE self) +{ + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z, self.reference.w); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, get_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z, self.reference.w); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set)(CFP_PTR_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z, self.reference.w) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, set_at)(CFP_PTR_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + static_cast(self.reference.array.object)->operator()(self.reference.x, self.reference.y, self.reference.z, self.reference.w) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, ref_at)(CFP_PTR_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_PTR_TYPE, next)(self, d); + return self.reference; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, lt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) < iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, gt)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) > iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, leq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == 
rhs.array.object && iter_offset(lhs) <= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, geq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && iter_offset(lhs) >= iter_offset(rhs); +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return lhs.array.object == rhs.array.object && + lhs.x == rhs.x && + lhs.y == rhs.y && + lhs.z == rhs.z && + lhs.w == rhs.w; +} + +static zfp_bool +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, neq)(CFP_ITER_TYPE lhs, CFP_ITER_TYPE rhs) +{ + return !_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, eq)(lhs, rhs); +} + +static ptrdiff_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, distance)(CFP_ITER_TYPE first, CFP_ITER_TYPE last) +{ + return iter_offset(last) - iter_offset(first); +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) + d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, prev)(CFP_ITER_TYPE it, ptrdiff_t d) +{ + iter_set_offset(it, iter_offset(it) - d); + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, inc)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t wmin = 0; + size_t wmax = container->size_w(); + size_t m = ~size_t(3); + ++it.x; + if (!(it.x & 3u) || it.x == xmax) { + it.x = std::max((it.x - 1) & m, xmin); + ++it.y; + if (!(it.y & 3u) || it.y == ymax) { + it.y = std::max((it.y - 1) & m, ymin); + ++it.z; + if (!(it.z & 3u) || it.z == zmax) { + it.z = std::max((it.z - 1) & m, zmin); + ++it.w; + if (!(it.w & 3u) || it.w == wmax) { + it.w = std::max((it.w - 1) & m, wmin); + // done with block; advance to next + it.x = (it.x + 4) & m; + if (it.x >= xmax) { + it.x = xmin; + it.y = (it.y + 4) & m; + if (it.y >= ymax) { + it.y = ymin; + it.z = (it.z + 4) & m; + if (it.z >= zmax) { + it.z = zmin; + it.w = (it.w + 4) & m; + if (it.w >= wmax) + it.w = wmax; + } + } + } + } + } + } + } + return it; +} + +static CFP_ITER_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, dec)(CFP_ITER_TYPE it) +{ + const ZFP_ARRAY_TYPE* container = static_cast(it.array.object); + size_t xmin = 0; + size_t xmax = container->size_x(); + size_t ymin = 0; + size_t ymax = container->size_y(); + size_t zmin = 0; + size_t zmax = container->size_z(); + size_t wmin = 0; + size_t wmax = container->size_w(); + size_t m = ~size_t(3); + if (it.w == wmax) { + it.x = xmax - 1; + it.y = ymax - 1; + it.z = zmax - 1; + it.w = wmax - 1; + } + else { + if (!(it.x & 3u) || it.x == xmin) { + it.x = std::min((it.x + 4) & m, xmax); + if (!(it.y & 3u) || it.y == ymin) { + it.y = std::min((it.y + 4) & m, ymax); + if (!(it.z & 3u) || it.z == zmin) { + it.z = std::min((it.z + 4) & m, zmax); + if (!(it.w & 3u) || it.w == wmin) { + it.w = std::min((it.w + 4) & m, wmax); + // done with block; advance to next + it.x = (it.x - 1) & m; + if (it.x <= xmin) { + it.x = xmax; + it.y = (it.y - 1) & m; + if (it.y <= ymin) { + it.y = ymax; + it.z = (it.z - 1) & m; + if (it.z <= zmin) { + it.z = zmax; + it.w = (it.w - 1) & m; + if (it.w <= wmin) + it.w = wmin; + } + } + } + } + --it.w; + } + --it.z; + } + --it.y; + } + --it.x; + } + return it; +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get)(CFP_ITER_TYPE self) +{ + return 
static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w); +} + +static ZFP_SCALAR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, get_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w); +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set)(CFP_ITER_TYPE self, ZFP_SCALAR_TYPE val) +{ + static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w) = val; +} + +static void +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, set_at)(CFP_ITER_TYPE self, ptrdiff_t d, ZFP_SCALAR_TYPE val) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + static_cast(self.array.object)->operator()(self.x, self.y, self.z, self.w) = val; +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y, self.z, self.w); +} + +static CFP_REF_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ref_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ref)(self.array, self.x, self.y, self.z, self.w); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr)(CFP_ITER_TYPE self) +{ + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y, self.z, self.w); +} + +static CFP_PTR_TYPE +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, ptr_at)(CFP_ITER_TYPE self, ptrdiff_t d) +{ + self = _t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, next)(self, d); + return _t1(CFP_ARRAY_TYPE, ptr)(self.array, self.x, self.y, self.z, self.w); +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, i)(CFP_ITER_TYPE self) +{ + return self.x; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, j)(CFP_ITER_TYPE self) +{ + return self.y; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, k)(CFP_ITER_TYPE self) +{ + return self.z; +} + +static size_t +_t2(CFP_ARRAY_TYPE, CFP_ITER_TYPE, l)(CFP_ITER_TYPE self) +{ + return self.w; +} diff --git a/cfp/template/cfpheader.cpp b/cfp/template/cfpheader.cpp new file mode 100644 index 00000000..b9f61917 --- /dev/null +++ b/cfp/template/cfpheader.cpp @@ -0,0 +1,166 @@ +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_buffer)(const void* data, size_t bytes) +{ + CFP_HEADER_TYPE h; + h.object = 0; + + try { + // construct generic header and query array type + header hdr(data, bytes); + uint dims = hdr.dimensionality(); + zfp_type scalar_type = hdr.scalar_type(); + // construct array-specific header + switch (dims) { + case 1: + if (scalar_type == zfp_type_float) + h.object = new zfp::array1f::header(data, bytes); + else if (scalar_type == zfp_type_double) + h.object = new zfp::array1d::header(data, bytes); + break; + case 2: + if (scalar_type == zfp_type_float) + h.object = new zfp::array2f::header(data, bytes); + else if (scalar_type == zfp_type_double) + h.object = new zfp::array2d::header(data, bytes); + break; + case 3: + if (scalar_type == zfp_type_float) + h.object = new zfp::array3f::header(data, bytes); + else if (scalar_type == zfp_type_double) + h.object = new zfp::array3d::header(data, bytes); + break; + case 4: + if (scalar_type == zfp_type_float) + h.object = new zfp::array4f::header(data, bytes); + else if (scalar_type == zfp_type_double) + h.object = new zfp::array4d::header(data, bytes); + break; + } + } + catch (...) 
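+  // any exception thrown for an invalid or unsupported header is swallowed;
+  // h.object remains null (0) so the caller can detect the failure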
{} + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array1f)(cfp_array1f a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array1f::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array1d)(cfp_array1d a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array1d::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array2f)(cfp_array2f a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array2f::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array2d)(cfp_array2d a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array2d::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array3f)(cfp_array3f a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array3f::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array3d)(cfp_array3d a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array3d::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array4f)(cfp_array4f a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array4f::header(*static_cast(a.object)); + return h; +} + +static CFP_HEADER_TYPE +_t1(CFP_HEADER_TYPE, ctor_array4d)(cfp_array4d a) +{ + CFP_HEADER_TYPE h; + h.object = new zfp::array4d::header(*static_cast(a.object)); + return h; +} + +static void +_t1(CFP_HEADER_TYPE, dtor)(CFP_HEADER_TYPE self) +{ + delete static_cast(self.object); +} + +static zfp_type +_t1(CFP_HEADER_TYPE, scalar_type)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->scalar_type(); +} + +static uint +_t1(CFP_HEADER_TYPE, dimensionality)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->dimensionality(); +} + +static size_t +_t1(CFP_HEADER_TYPE, size_x)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->size_x(); +} + +static size_t +_t1(CFP_HEADER_TYPE, size_y)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->size_y(); +} + +static size_t +_t1(CFP_HEADER_TYPE, size_z)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->size_z(); +} + +static size_t +_t1(CFP_HEADER_TYPE, size_w)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->size_w(); +} + +static double +_t1(CFP_HEADER_TYPE, rate)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->rate(); +} + +static const void* +_t1(CFP_HEADER_TYPE, data)(CFP_HEADER_TYPE self) +{ + return static_cast(self.object)->data(); +} + +static size_t +_t1(CFP_HEADER_TYPE, size_bytes)(CFP_HEADER_TYPE self, uint mask) +{ + return static_cast(self.object)->size_bytes(mask); +} diff --git a/cmake/appveyor.cmake b/cmake/appveyor.cmake index 6633b16b..1e803713 100644 --- a/cmake/appveyor.cmake +++ b/cmake/appveyor.cmake @@ -11,7 +11,9 @@ set(CTEST_SITE "appveyor") set(CTEST_CMAKE_GENERATOR "${GENERATOR}") set(CTEST_BUILD_NAME "$ENV{APPVEYOR_REPO_BRANCH}-${job_details}") set(cfg_options + -DCMAKE_BUILD_TYPE=$ENV{BUILD_TYPE} -DBUILD_CFP=${BUILD_CFP} + -DBUILD_ZFPY=${BUILD_ZFPY} -DZFP_WITH_OPENMP=${BUILD_OPENMP} -DZFP_WITH_CUDA=${BUILD_CUDA} ) @@ -46,6 +48,19 @@ if(BUILD_CFP) endif() endif() +if(BUILD_ZFPY) + set(CTEST_SITE "${CTEST_SITE}_zfpy$ENV{PYTHON_VERSION}") + + # sanitize python include dir path (ex. 
windows vs linux slashes) + set(PYTHON_INCLUDE_DIR "") + file(TO_CMAKE_PATH "${CTEST_SOURCE_DIRECTORY}\\$ENV{VIRTUALENV_NAME}\\Include" PYTHON_INCLUDE_DIR) + + list(APPEND cfg_options + -DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR} + -DPYTHON_LIBRARY=$ENV{PYTHON_LIB_PATH} + ) +endif() + if(OMP_TESTS_ONLY) list(APPEND cfg_options -DZFP_OMP_TESTS_ONLY=1 diff --git a/cmake/travis.cmake b/cmake/travis.cmake deleted file mode 100644 index f2bf844b..00000000 --- a/cmake/travis.cmake +++ /dev/null @@ -1,87 +0,0 @@ - -set(CTEST_SOURCE_DIRECTORY "$ENV{TRAVIS_BUILD_DIR}") -set(CTEST_BINARY_DIRECTORY "$ENV{TRAVIS_BUILD_DIR}/build") - -set(CTEST_COMMAND ctest) -include(${CTEST_SOURCE_DIRECTORY}/CTestConfig.cmake) -set(CTEST_SITE "travis") -set(CTEST_CMAKE_GENERATOR "Unix Makefiles") -set(CTEST_BUILD_NAME "$ENV{TRAVIS_BRANCH}-#$ENV{TRAVIS_JOB_NUMBER}") -set(cfg_options - -DCMAKE_C_STANDARD=${C_STANDARD} - -DCMAKE_CXX_STANDARD=${CXX_STANDARD} - -DBUILD_CFP=${BUILD_CFP} - -DBUILD_ZFPY=${BUILD_ZFPY} - -DBUILD_ZFORP=${BUILD_ZFORP} - -DZFP_WITH_OPENMP=${BUILD_OPENMP} - -DZFP_WITH_CUDA=${BUILD_CUDA} - ) - -# Add the variants to the testers name so that we can report multiple -# times from the same CI builder -if(BUILD_OPENMP) - set(CTEST_SITE "${CTEST_SITE}_openmp") -endif() - -if(BUILD_CUDA) - set(CTEST_SITE "${CTEST_SITE}_cuda") -endif() - -if(BUILD_CFP) - set(CTEST_SITE "${CTEST_SITE}_cfp") - - if(CFP_NAMESPACE) - list(APPEND cfg_options - -DCFP_NAMESPACE=${CFP_NAMESPACE} - ) - set(CTEST_SITE "${CTEST_SITE}namespace") - endif() -endif() - -if(BUILD_ZFPY) - set(CTEST_SITE "${CTEST_SITE}_zfpy$ENV{PYTHON_VERSION}") - list(APPEND cfg_options - -DPYTHON_INCLUDE_DIR=$ENV{PYTHON_INCLUDE_DIR} - -DPYTHON_LIBRARY=$ENV{PYTHON_LIBRARY} - -DPYTHON_EXECUTABLE=$ENV{PYTHON_EXECUTABLE} - ) -endif() - -if(BUILD_ZFORP) - set(CTEST_SITE "${CTEST_SITE}_zforp$ENV{FORTRAN_STANDARD}") - list(APPEND cfg_options - -DCMAKE_FORTRAN_FLAGS='-std=f$ENV{FORTRAN_STANDARD}' - ) -endif() - -if(WITH_COVERAGE) - list(APPEND cfg_options - -DCMAKE_C_FLAGS=-coverage - -DCMAKE_CXX_FLAGS=-coverage - -DCMAKE_Fortran_FLAGS=-coverage - ) - set(CTEST_SITE "${CTEST_SITE}_coverage") -endif() - -if(OMP_TESTS_ONLY) - list(APPEND cfg_options - -DZFP_OMP_TESTS_ONLY=1 - ) -endif() - -ctest_start(Experimental TRACK Travis) -ctest_configure(OPTIONS "${cfg_options}") -ctest_submit(PARTS Update Notes Configure) -ctest_build(FLAGS -j1) -ctest_submit(PARTS Build) -ctest_test(PARALLEL_LEVEL 6 RETURN_VALUE rv) -ctest_submit(PARTS Test) - -if(WITH_COVERAGE) - ctest_coverage() - ctest_submit(PARTS Coverage) -endif() - -if(NOT rv EQUAL 0) - message(FATAL_ERROR "Test failures occurred.") -endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7cc76068..73137223 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,6 +1,14 @@ +add_executable(array array.cpp) +target_compile_definitions(array PRIVATE ${zfp_compressed_array_defs}) +target_link_libraries(array zfp) + add_executable(diffusion diffusion.cpp) -target_link_libraries(diffusion zfp) target_compile_definitions(diffusion PRIVATE ${zfp_compressed_array_defs}) +if(ZFP_WITH_OPENMP) + target_link_libraries(diffusion zfp OpenMP::OpenMP_CXX) +else() + target_link_libraries(diffusion zfp) +endif() if(BUILD_CFP) add_executable(diffusionC diffusionC.c) @@ -14,9 +22,18 @@ add_executable(iterator iterator.cpp) target_link_libraries(iterator zfp) target_compile_definitions(iterator PRIVATE ${zfp_compressed_array_defs}) +if(BUILD_CFP) + add_executable(iteratorC iteratorC.c) + 
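+  # iteratorC exercises the cfp (C API) iterators; linking against cfp is
+  # expected to pull in zfp transitively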
target_link_libraries(iteratorC cfp) +endif() + add_executable(pgm pgm.c) target_link_libraries(pgm zfp) +add_executable(ppm ppm.c) +target_link_libraries(ppm zfp) +target_compile_definitions(ppm PRIVATE ${ppm_private_defs}) + add_executable(simple simple.c) target_link_libraries(simple zfp) @@ -24,6 +41,7 @@ add_executable(speed speed.c) target_link_libraries(speed zfp) if(HAVE_LIBM_MATH) + target_link_libraries(array m) target_link_libraries(diffusion m) if(BUILD_CFP) @@ -32,5 +50,6 @@ if(HAVE_LIBM_MATH) target_link_libraries(inplace m) target_link_libraries(pgm m) + target_link_libraries(ppm m) target_link_libraries(simple m) endif() diff --git a/examples/Makefile b/examples/Makefile index bb44b1e1..0e288544 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,35 +1,62 @@ include ../Config BINDIR = ../bin -TARGETS = $(BINDIR)/diffusion\ +TARGETS = $(BINDIR)/array\ + $(BINDIR)/diffusion\ $(BINDIR)/inplace\ $(BINDIR)/iterator\ $(BINDIR)/pgm\ + $(BINDIR)/ppm\ $(BINDIR)/simple\ $(BINDIR)/speed +INCS = -I../include LIBS = -L../lib -lzfp -CLIBS = $(LIBS) -lm -CXXLIBS = $(LIBS) +CLIBS = $(LIBS) $(LDFLAGS) -lm +CXXLIBS = $(LIBS) $(LDFLAGS) + +# add cfp examples when BUILD_CFP is enabled +ifneq ($(BUILD_CFP),0) + TARGETS += $(BINDIR)/diffusionC $(BINDIR)/iteratorC +endif + all: $(TARGETS) +$(BINDIR)/array: array.cpp ../lib/$(LIBZFP) + $(CXX) $(CXXFLAGS) $(INCS) array.cpp $(CXXLIBS) -o $@ + $(BINDIR)/diffusion: diffusion.cpp ../lib/$(LIBZFP) - $(CXX) $(CXXFLAGS) -I../array diffusion.cpp $(CXXLIBS) -o $@ + $(CXX) $(CXXFLAGS) $(INCS) diffusion.cpp $(CXXLIBS) -o $@ + +$(BINDIR)/diffusionC: diffusionC.o ../lib/$(LIBZFP) ../lib/$(LIBCFP) + $(CXX) $(CXXFLAGS) diffusionC.o -lcfp $(CLIBS) -o $@ + +diffusionC.o: diffusionC.c + $(CC) $(CFLAGS) $(INCS) -c diffusionC.c $(BINDIR)/inplace: inplace.c ../lib/$(LIBZFP) - $(CC) $(CFLAGS) inplace.c $(CLIBS) -o $@ + $(CC) $(CFLAGS) $(INCS) inplace.c $(CLIBS) -o $@ $(BINDIR)/iterator: iterator.cpp ../lib/$(LIBZFP) - $(CXX) $(CXXFLAGS) -I../array iterator.cpp $(CXXLIBS) -o $@ + $(CXX) $(CXXFLAGS) $(INCS) iterator.cpp $(CXXLIBS) -o $@ + +$(BINDIR)/iteratorC: iteratorC.o ../lib/$(LIBZFP) ../lib/$(LIBCFP) + $(CXX) $(CXXFLAGS) iteratorC.o -lcfp $(CLIBS) -o $@ + +iteratorC.o: iteratorC.c + $(CC) $(CFLAGS) $(INCS) -c iteratorC.c $(BINDIR)/pgm: pgm.c ../lib/$(LIBZFP) - $(CC) $(CFLAGS) pgm.c $(CLIBS) -o $@ + $(CC) $(CFLAGS) $(INCS) pgm.c $(CLIBS) -o $@ + +$(BINDIR)/ppm: ppm.c ../lib/$(LIBZFP) + $(CC) $(CFLAGS) $(PPM_FLAGS) $(INCS) ppm.c $(CLIBS) -o $@ $(BINDIR)/simple: simple.c ../lib/$(LIBZFP) - $(CC) $(CFLAGS) simple.c $(CLIBS) -o $@ + $(CC) $(CFLAGS) $(INCS) simple.c $(CLIBS) -o $@ $(BINDIR)/speed: speed.c ../lib/$(LIBZFP) - $(CC) $(CFLAGS) speed.c $(CLIBS) -o $@ + $(CC) $(CFLAGS) $(INCS) speed.c $(CLIBS) -o $@ clean: - rm -f $(TARGETS) + rm -f $(TARGETS) $(BINDIR)/diffusionC $(BINDIR)/iteratorC diffusionC.o iteratorC.o diff --git a/examples/array.cpp b/examples/array.cpp new file mode 100644 index 00000000..233cb36e --- /dev/null +++ b/examples/array.cpp @@ -0,0 +1,42 @@ +// simple example that shows how to work with zfp's compressed-array classes + +#include +#include +#include "zfp/array2.hpp" + +int main() +{ + // array dimensions (can be arbitrary) and zfp memory footprint + const size_t nx = 12; + const size_t ny = 8; + const double bits_per_value = 4.0; + + // declare 2D arrays using STL and zfp + std::vector vec(nx * ny); + zfp::array2 arr(nx, ny, bits_per_value); + + // initialize arrays to linear ramp + for (size_t y = 0; y < ny; y++) + for (size_t x = 0; 
x < nx; x++) + arr(x, y) = vec[x + nx * y] = x + nx * y; + + // alternative initialization of entire array, arr: + // arr.set(&vec[0]); + + // optional: force compression of cached data + arr.flush_cache(); + + // print values + for (size_t y = 0; y < ny; y++) + for (size_t x = 0; x < nx; x++) + std::cout << vec[x + nx * y] << " " << arr(x, y) << std::endl; + + // alternative using printf(); note the necessary cast: + // printf("%g %g\n", vec[x + nx * y], (double)arr(x, y)); + + // print storage size of payload data + std::cout << "vec bytes = " << vec.capacity() * sizeof(vec[0]) << std::endl; + std::cout << "zfp bytes = " << arr.size_bytes(ZFP_DATA_PAYLOAD) << std::endl; + + return 0; +} diff --git a/examples/array2d.h b/examples/array2d.h deleted file mode 100644 index 8ba7291c..00000000 --- a/examples/array2d.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef ARRAY2D_H -#define ARRAY2D_H - -#include -#include - -#define unused_(x) ((void)(x)) - -typedef unsigned int uint; - -// uncompressed 2D double-precision array (for comparison) -namespace raw { -class array2d { -public: - array2d() : nx(0), ny(0) {} - array2d(uint nx, uint ny, double rate = 0.0, const double* p = 0, size_t csize = 0) : nx(nx), ny(ny), data(nx * ny, 0.0) - { - unused_(rate); - unused_(p); - unused_(csize); - } - void resize(uint nx, uint ny) { this->nx = nx; this->ny = ny; data.resize(nx * ny, 0.0); } - size_t size() const { return data.size(); } - size_t size_x() const { return nx; } - size_t size_y() const { return ny; } - double rate() const { return CHAR_BIT * sizeof(double); } - size_t cache_size() const { return 0; } - double& operator()(uint x, uint y) { return data[x + nx * y]; } - const double& operator()(uint x, uint y) const { return data[x + nx * y]; } - double& operator[](uint i) { return data[i]; } - const double& operator[](uint i) const { return data[i]; } - class iterator { - public: - double& operator*() const { return array->operator[](index); } - iterator& operator++() { index++; return *this; } - iterator operator++(int) { iterator p = *this; index++; return p; } - bool operator==(const iterator& it) const { return array == it.array && index == it.index; } - bool operator!=(const iterator& it) const { return !operator==(it); } - uint i() const { return index % array->nx; } - uint j() const { return index / array->nx; } - protected: - friend class array2d; - iterator(array2d* array, uint index) : array(array), index(index) {} - array2d* array; - uint index; - }; - iterator begin() { return iterator(this, 0); } - iterator end() { return iterator(this, nx * ny); } -protected: - uint nx; - uint ny; - std::vector data; -}; -} - -#undef unused_ - -#endif diff --git a/examples/array2d.hpp b/examples/array2d.hpp new file mode 100644 index 00000000..c349328b --- /dev/null +++ b/examples/array2d.hpp @@ -0,0 +1,72 @@ +#ifndef ARRAY2D_HPP +#define ARRAY2D_HPP + +#include +#include + +typedef unsigned int uint; + +// uncompressed 2D double-precision array (for comparison) +namespace raw { +class array2d { +public: + // constructors + array2d() : nx(0), ny(0) {} + array2d(size_t nx, size_t ny, double = 0.0, const double* = 0, size_t = 0) : nx(nx), ny(ny), data(nx * ny, 0.0) {} + + // array size + size_t size() const { return data.size(); } + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + void resize(size_t nx, size_t ny) { this->nx = nx; this->ny = ny; data.resize(nx * ny, 0.0); } + + // rate in bits/value + double rate() const { return CHAR_BIT * sizeof(double); } + + // cache size in 
bytes + size_t cache_size() const { return 0; } + + // byte size of data structures + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) + size += sizeof(*this); + if (mask & ZFP_DATA_PAYLOAD) + size += data.size() * sizeof(double); + return size; + } + + // accessors + double& operator()(size_t x, size_t y) { return data[x + nx * y]; } + const double& operator()(size_t x, size_t y) const { return data[x + nx * y]; } + double& operator[](size_t index) { return data[index]; } + const double& operator[](size_t index) const { return data[index]; } + + // minimal-functionality forward iterator + class iterator { + public: + double& operator*() const { return array->operator[](index); } + iterator& operator++() { index++; return *this; } + iterator operator++(int) { iterator p = *this; index++; return p; } + bool operator==(const iterator& it) const { return array == it.array && index == it.index; } + bool operator!=(const iterator& it) const { return !operator==(it); } + size_t i() const { return index % array->nx; } + size_t j() const { return index / array->nx; } + protected: + friend class array2d; + iterator(array2d* array, size_t index) : array(array), index(index) {} + array2d* array; + size_t index; + }; + + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx * ny); } + +protected: + size_t nx, ny; + std::vector data; +}; +} + +#endif diff --git a/examples/diffusion.cpp b/examples/diffusion.cpp index 82cc109d..a62f191e 100644 --- a/examples/diffusion.cpp +++ b/examples/diffusion.cpp @@ -6,19 +6,46 @@ #include #include #include -#include "zfparray2.h" -#include "array2d.h" +#include +#include "zfp/array2.hpp" +#include "zfp/constarray2.hpp" +#include "zfp/codec/gencodec.hpp" +#include "array2d.hpp" + +// add half precision if compiler supports it +#define __STDC_WANT_IEC_60559_TYPES_EXT__ +#include +#ifdef FLT16_MAX + #define WITH_HALF 1 +#else + #undef WITH_HALF +#endif #ifdef _OPENMP #include #endif -#define unused_(x) ((void)(x)) +// uncompressed tiled arrays based on zfp generic codec +namespace tiled { +#if WITH_HALF + typedef zfp::array2< double, zfp::codec::generic2 > array2h; +#endif + typedef zfp::array2< double, zfp::codec::generic2 > array2f; + typedef zfp::array2< double, zfp::codec::generic2 > array2d; +} + +// enumeration of uncompressed storage types +enum storage_type { + type_none = 0, + type_half = 1, + type_float = 2, + type_double = 3 +}; // constants used in the solution class Constants { public: - Constants(int nx, int ny, int nt) : + Constants(size_t nx, size_t ny, size_t nt) : nx(nx), ny(ny), nt(nt), @@ -32,11 +59,11 @@ class Constants { pi(3.14159265358979323846) {} - int nx; // grid points in x - int ny; // grid points in y - int nt; // number of time steps (0 for default) - int x0; // x location of heat source - int y0; // y location of heat source + size_t nx; // grid points in x + size_t ny; // grid points in y + size_t nt; // number of time steps (0 for default) + size_t x0; // x location of heat source + size_t y0; // y location of heat source double k; // diffusion constant double dx; // grid spacing in x double dy; // grid spacing in y @@ -45,20 +72,31 @@ class Constants { double pi; // 3.141... 
}; +// compute Laplacian uxx + uyy at (x, y) template +inline double +laplacian(const array2d& u, size_t x, size_t y, const Constants& c) +{ + double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); + double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); + return uxx + uyy; +} + +template inline void -time_step_parallel(array2d& u, const Constants& c); +time_step_parallel(state& u, scratch& v, const Constants& c); +#ifdef _OPENMP // advance solution in parallel via thread-safe views template <> inline void -time_step_parallel(zfp::array2d& u, const Constants& c) +time_step_parallel(zfp::array2d& u, zfp::array2d& du, const Constants& c) { -#ifdef _OPENMP // flush shared cache to ensure cache consistency across threads u.flush_cache(); + // zero-initialize du + du.set(0); // compute du/dt in parallel - zfp::array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); #pragma omp parallel { // create read-only private view of entire array u @@ -67,127 +105,177 @@ time_step_parallel(zfp::array2d& u, const Constants& c) zfp::array2d::private_view mydu(&du); mydu.partition(omp_get_thread_num(), omp_get_num_threads()); // process rectangular region owned by this thread - for (uint j = 0; j < mydu.size_y(); j++) { - int y = mydu.global_y(j); + for (size_t j = 0; j < mydu.size_y(); j++) { + size_t y = mydu.global_y(j); if (1 <= y && y <= c.ny - 2) - for (uint i = 0; i < mydu.size_x(); i++) { - int x = mydu.global_x(i); - if (1 <= x && x <= c.nx - 2) { - double uxx = (myu(x - 1, y) - 2 * myu(x, y) + myu(x + 1, y)) / (c.dx * c.dx); - double uyy = (myu(x, y - 1) - 2 * myu(x, y) + myu(x, y + 1)) / (c.dy * c.dy); - mydu(i, j) = c.dt * c.k * (uxx + uyy); - } + for (size_t i = 0; i < mydu.size_x(); i++) { + size_t x = mydu.global_x(i); + if (1 <= x && x <= c.nx - 2) + mydu(i, j) = c.dt * c.k * laplacian(myu, x, y, c); } } // compress all private cached blocks to shared storage mydu.flush_cache(); } // take forward Euler step in serial - for (uint i = 0; i < u.size(); i++) + for (size_t i = 0; i < u.size(); i++) u[i] += du[i]; +} #else - unused_(u); - unused_(c); +// dummy template instantiation when OpenMP support is not available +template <> +inline void time_step_parallel(zfp::array2d&, zfp::array2d&, const Constants&) {} +#endif + +// dummy template instantiations; never executed +template <> +inline void time_step_parallel(zfp::const_array2d&, raw::array2d&, const Constants&) {} +template <> +inline void time_step_parallel(raw::array2d&, raw::array2d&, const Constants&) {} +template <> +inline void time_step_parallel(tiled::array2d&, tiled::array2d&, const Constants&) {} +template <> +inline void time_step_parallel(tiled::array2f&, tiled::array2f&, const Constants&) {} +#if WITH_HALF +template <> +inline void time_step_parallel(tiled::array2h&, tiled::array2h&, const Constants&) {} #endif + +// advance solution using integer array indices (generic implementation) +template +inline void +time_step_indexed(state& u, scratch& du, const Constants& c) +{ + // compute du/dt + for (size_t y = 1; y < c.ny - 1; y++) + for (size_t x = 1; x < c.nx - 1; x++) + du(x, y) = c.dt * c.k * laplacian(u, x, y, c); + // take forward Euler step + for (uint i = 0; i < u.size(); i++) + u[i] += du[i]; } -// dummy template instantiation; never executed +// advance solution using integer array indices (read-only arrays) template <> inline void -time_step_parallel(raw::array2d& u, const Constants& c) +time_step_indexed(zfp::const_array2d& u, raw::array2d& v, const Constants& c) { - unused_(u); - 
unused_(c); + // initialize v as uncompressed copy of u + u.get(&v[0]); + // take forward Euler step v += (du/dt) dt + for (size_t y = 1; y < c.ny - 1; y++) + for (size_t x = 1; x < c.nx - 1; x++) + v(x, y) += c.dt * c.k * laplacian(u, x, y, c); + // update u with uncompressed copy v + u.set(&v[0]); } -// advance solution using integer array indices -template +// advance solution using array iterators (generic implementation) +template inline void -time_step_indexed(array2d& u, const Constants& c) +time_step_iterated(state& u, scratch& du, const Constants& c) { // compute du/dt - array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); - for (int y = 1; y < c.ny - 1; y++) { - for (int x = 1; x < c.nx - 1; x++) { - double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); - double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); - du(x, y) = c.dt * c.k * (uxx + uyy); - } + for (typename scratch::iterator q = du.begin(); q != du.end(); q++) { + size_t x = q.i(); + size_t y = q.j(); + if (1 <= x && x <= c.nx - 2 && + 1 <= y && y <= c.ny - 2) + *q = c.dt * c.k * laplacian(u, x, y, c); } // take forward Euler step - for (uint i = 0; i < u.size(); i++) - u[i] += du[i]; + for (typename state::iterator p = u.begin(); p != u.end(); p++) + *p += du(p.i(), p.j()); } -// advance solution using array iterators -template +// advance solution using array iterators (read-only arrays) +template <> inline void -time_step_iterated(array2d& u, const Constants& c) +time_step_iterated(zfp::const_array2d& u, raw::array2d& v, const Constants& c) { - // compute du/dt - array2d du(c.nx, c.ny, u.rate(), 0, u.cache_size()); - for (typename array2d::iterator p = du.begin(); p != du.end(); p++) { - int x = p.i(); - int y = p.j(); + // initialize v as uncompressed copy of u + u.get(&v[0]); + // take forward Euler step v += (du/dt) dt + for (raw::array2d::iterator q = v.begin(); q != v.end(); q++) { + size_t x = q.i(); + size_t y = q.j(); if (1 <= x && x <= c.nx - 2 && - 1 <= y && y <= c.ny - 2) { - double uxx = (u(x - 1, y) - 2 * u(x, y) + u(x + 1, y)) / (c.dx * c.dx); - double uyy = (u(x, y - 1) - 2 * u(x, y) + u(x, y + 1)) / (c.dy * c.dy); - *p = c.dt * c.k * (uxx + uyy); - } + 1 <= y && y <= c.ny - 2) + *q += c.dt * c.k * laplacian(u, x, y, c); } - // take forward Euler step - for (typename array2d::iterator p = u.begin(), q = du.begin(); p != u.end(); p++, q++) - *p += *q; + // update u with uncompressed copy v + u.set(&v[0]); } -// solve heat equation using -template -inline double -solve(array2d& u, const Constants& c, bool iterator, bool parallel) +// set initial conditions with a point heat source (u is assumed zero-initialized) +template +inline void +initialize(state& u, scratch&, const Constants& c) { - // initialize u with point heat source (u is assumed to be zero initialized) u(c.x0, c.y0) = 1; +} + +// set initial conditions for const_array; requires updating the whole array +template <> +inline void +initialize(zfp::const_array2d& u, raw::array2d& v, const Constants& c) +{ + v(c.x0, c.y0) = 1; + u.set(&v[0]); +} + +// solve heat equation +template +inline double +solve(state& u, scratch& v, const Constants& c, bool iterator, bool parallel) +{ + // initialize u with point heat source + initialize(u, v, c); // iterate until final time double t; for (t = 0; t < c.tfinal; t += c.dt) { - std::cerr << "t=" << std::setprecision(6) << std::fixed << t << std::endl; + // print time and effective rate + double rate = double(u.size_bytes(ZFP_DATA_PAYLOAD)) * CHAR_BIT / u.size(); + double 
rest = double(u.size_bytes(ZFP_DATA_ALL ^ ZFP_DATA_PAYLOAD) * CHAR_BIT / u.size()); + std::cerr << "time=" << std::setprecision(6) << std::fixed << t << " "; + std::cerr << "rate=" << std::setprecision(3) << std::fixed << rate << " (+" << rest << ")" << std::endl; + // advance solution one time step if (parallel) - time_step_parallel(u, c); + time_step_parallel(u, v, c); else if (iterator) - time_step_iterated(u, c); + time_step_iterated(u, v, c); else - time_step_indexed(u, c); + time_step_indexed(u, v, c); } return t; } // compute sum of array values -template +template inline double -total(const array2d& u) +total(const state& u) { double s = 0; - const int nx = u.size_x(); - const int ny = u.size_y(); - for (int y = 1; y < ny - 1; y++) - for (int x = 1; x < nx - 1; x++) + const size_t nx = u.size_x(); + const size_t ny = u.size_y(); + for (size_t y = 1; y < ny - 1; y++) + for (size_t x = 1; x < nx - 1; x++) s += u(x, y); return s; } // compute root mean square error with respect to exact solution -template +template inline double -error(const array2d& u, const Constants& c, double t) +error(const state& u, const Constants& c, double t) { double e = 0; - for (int y = 1; y < c.ny - 1; y++) { - double py = c.dy * (y - c.y0); - for (int x = 1; x < c.nx - 1; x++) { - double px = c.dx * (x - c.x0); + for (size_t y = 1; y < c.ny - 1; y++) { + double py = c.dy * ((int)y - (int)c.y0); + for (size_t x = 1; x < c.nx - 1; x++) { + double px = c.dx * ((int)x - (int)c.x0); double f = u(x, y); double g = c.dx * c.dy * std::exp(-(px * px + py * py) / (4 * c.k * t)) / (4 * c.pi * c.k * t); e += (f - g) * (f - g); @@ -196,93 +284,195 @@ error(const array2d& u, const Constants& c, double t) return std::sqrt(e / ((c.nx - 2) * (c.ny - 2))); } +// execute solver and evaluate error +template +inline void +execute(state& u, scratch& v, size_t nt, bool iterator, bool parallel) +{ + Constants c(u.size_x(), u.size_y(), nt); + double t = solve(u, v, c, iterator, parallel); + double sum = total(u); + double err = error(u, c, t); + std::cerr.unsetf(std::ios::fixed); + std::cerr << "sum=" << std::setprecision(6) << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl; +} + +// print usage information inline int usage() { std::cerr << "Usage: diffusion [options]" << std::endl; std::cerr << "Options:" << std::endl; + std::cerr << "-a : use compressed arrays with given absolute error tolerance" << std::endl; + std::cerr << "-b : use 'blocks' 4x4 blocks of cache" << std::endl; + std::cerr << "-c : use read-only compressed arrays" << std::endl; + std::cerr << "-d : use double-precision tiled arrays" << std::endl; + std::cerr << "-f : use single-precision tiled arrays" << std::endl; +#if WITH_HALF + std::cerr << "-h : use half-precision tiled arrays" << std::endl; +#endif std::cerr << "-i : traverse arrays using iterators" << std::endl; - std::cerr << "-n : number of grid points" << std::endl; #ifdef _OPENMP - std::cerr << "-p : use multithreading (only with compressed arrays)" << std::endl; + std::cerr << "-j : use multithreading (only with compressed arrays)" << std::endl; #endif + std::cerr << "-n : number of grid points" << std::endl; + std::cerr << "-p : use compressed arrays with given precision" << std::endl; + std::cerr << "-r : use compressed arrays with given compressed bits/value" << std::endl; + std::cerr << "-R : use compressed arrays with lossless compression" << std::endl; std::cerr << "-t : number of time steps" << std::endl; - std::cerr << "-r : use compressed 
arrays with 'rate' bits/value" << std::endl; - std::cerr << "-c : use 'blocks' 4x4 blocks of cache" << std::endl; return EXIT_FAILURE; } int main(int argc, char* argv[]) { - int nx = 100; - int ny = 100; - int nt = 0; - double rate = 64; + size_t nx = 128; + size_t ny = 128; + size_t nt = 0; + size_t cache_size = 0; + zfp_config config = zfp_config_none(); bool iterator = false; - bool compression = false; bool parallel = false; - int cache = 0; + bool writable = true; + storage_type type = type_none; // parse command-line options for (int i = 1; i < argc; i++) - if (std::string(argv[i]) == "-i") - iterator = true; - else if (std::string(argv[i]) == "-n") { - if (++i == argc || sscanf(argv[i], "%i", &nx) != 1 || - ++i == argc || sscanf(argv[i], "%i", &ny) != 1) + if (std::string(argv[i]) == "-a") { + double tolerance; + if (++i == argc || sscanf(argv[i], "%lf", &tolerance) != 1) return usage(); + config = zfp_config_accuracy(tolerance); } + else if (std::string(argv[i]) == "-b") { + if (++i == argc || (std::istringstream(argv[i]) >> cache_size).fail()) + return usage(); + cache_size *= 4 * 4 * sizeof(double); + } + else if (std::string(argv[i]) == "-c") + writable = false; + else if (std::string(argv[i]) == "-d") + type = type_double; + else if (std::string(argv[i]) == "-f") + type = type_float; +#if WITH_HALF + else if (std::string(argv[i]) == "-h") + type = type_half; +#endif + else if (std::string(argv[i]) == "-i") + iterator = true; #ifdef _OPENMP - else if (std::string(argv[i]) == "-p") + else if (std::string(argv[i]) == "-j") parallel = true; #endif - else if (std::string(argv[i]) == "-t") { - if (++i == argc || sscanf(argv[i], "%i", &nt) != 1) + else if (std::string(argv[i]) == "-n") { + if (++i == argc || (std::istringstream(argv[i]) >> nx).fail() || + ++i == argc || (std::istringstream(argv[i]) >> ny).fail()) + return usage(); + } + else if (std::string(argv[i]) == "-p") { + uint precision; + if (++i == argc || sscanf(argv[i], "%u", &precision) != 1) return usage(); + config = zfp_config_precision(precision); } else if (std::string(argv[i]) == "-r") { + double rate; if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) return usage(); - compression = true; + config = zfp_config_rate(rate, false); } - else if (std::string(argv[i]) == "-c") { - if (++i == argc || sscanf(argv[i], "%i", &cache) != 1) + else if (std::string(argv[i]) == "-R") + config = zfp_config_reversible(); + else if (std::string(argv[i]) == "-t") { + if (++i == argc || (std::istringstream(argv[i]) >> nt).fail()) return usage(); } else return usage(); + bool compression = (config.mode != zfp_mode_null); + + // sanity check command-line arguments if (parallel && !compression) { fprintf(stderr, "multithreading requires compressed arrays\n"); return EXIT_FAILURE; } + if (parallel && !writable) { + fprintf(stderr, "multithreading requires read-write arrays\n"); + return EXIT_FAILURE; + } if (parallel && iterator) { fprintf(stderr, "multithreading does not support iterators\n"); return EXIT_FAILURE; } + if (compression && writable && config.mode != zfp_mode_fixed_rate) { + fprintf(stderr, "compression mode requires read-only arrays (-c)\n"); + return EXIT_FAILURE; + } + if (!writable && !compression) { + fprintf(stderr, "read-only arrays require compression parameters\n"); + return EXIT_FAILURE; + } + if (compression && type != type_none) { + fprintf(stderr, "tiled arrays do not support compression parameters\n"); + return EXIT_FAILURE; + } - Constants c(nx, ny, nt); + // if unspecified, set cache size to two layers of 
blocks + if (!cache_size) + cache_size = 2 * 4 * nx * sizeof(double); - double sum; - double err; + // solve problem if (compression) { - // solve problem using compressed arrays - zfp::array2d u(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double)); - rate = u.rate(); - double t = solve(u, c, iterator, parallel); - sum = total(u); - err = error(u, c, t); + // use compressed arrays + if (writable) { + // use read-write fixed-rate arrays + zfp::array2d u(nx, ny, config.arg.rate, 0, cache_size); + zfp::array2d v(nx, ny, config.arg.rate, 0, cache_size); + execute(u, v, nt, iterator, parallel); + } + else { + // use read-only variable-rate arrays + zfp::const_array2d u(nx, ny, config, 0, cache_size); + raw::array2d v(nx, ny); + execute(u, v, nt, iterator, parallel); + } } else { - // solve problem using uncompressed arrays - raw::array2d u(nx, ny); - double t = solve(u, c, iterator, parallel); - sum = total(u); - err = error(u, c, t); + // use uncompressed arrays + switch (type) { +#if WITH_HALF + case type_half: { + // use zfp generic codec with tiled half-precision storage + tiled::array2h u(nx, ny, sizeof(__fp16) * CHAR_BIT, 0, cache_size); + tiled::array2h v(nx, ny, sizeof(__fp16) * CHAR_BIT, 0, cache_size); + execute(u, v, nt, iterator, parallel); + } + break; +#endif + case type_float: { + // use zfp generic codec with tiled single-precision storage + tiled::array2f u(nx, ny, sizeof(float) * CHAR_BIT, 0, cache_size); + tiled::array2f v(nx, ny, sizeof(float) * CHAR_BIT, 0, cache_size); + execute(u, v, nt, iterator, parallel); + } + break; + case type_double: { + // use zfp generic codec with tiled double-precision storage + tiled::array2d u(nx, ny, sizeof(double) * CHAR_BIT, 0, cache_size); + tiled::array2d v(nx, ny, sizeof(double) * CHAR_BIT, 0, cache_size); + execute(u, v, nt, iterator, parallel); + } + break; + default: { + // use uncompressed array with row-major double-precision storage + raw::array2d u(nx, ny, sizeof(double) * CHAR_BIT); + raw::array2d v(nx, ny, sizeof(double) * CHAR_BIT); + execute(u, v, nt, iterator, parallel); + } + break; + } } - std::cerr.unsetf(std::ios::fixed); - std::cerr << "rate=" << rate << " sum=" << std::fixed << sum << " error=" << std::setprecision(6) << std::scientific << err << std::endl; - return 0; } diff --git a/examples/diffusionC.c b/examples/diffusionC.c index 99a5c3db..5e840456 100644 --- a/examples/diffusionC.c +++ b/examples/diffusionC.c @@ -1,28 +1,30 @@ -// forward Euler finite difference solution to the heat equation on a 2D grid -// (ported to C, from diffusion.cpp) +/* +forward Euler finite difference solution to the heat equation on a 2D grid +(ported to C, from diffusion.cpp) +*/ #include #include #include +#include "zfp/array.h" -#include "cfparrays.h" #define _ (CFP_NAMESPACE.array2d) #define MAX(x, y) (((nx) > (ny)) ? (nx) : (ny)) -// constants used in the solution +/* constants used in the solution */ typedef struct { - int nx; // grid points in x - int ny; // grid points in y - int nt; // number of time steps (0 for default) - int x0; // x location of heat source - int y0; // y location of heat source - double k; // diffusion constant - double dx; // grid spacing in x - double dy; // grid spacing in y - double dt; // time step - double tfinal; // minimum time to run solution to - double pi; // 3.141... 
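The mode-selection logic in diffusion.cpp above hinges on the zfp_config helpers introduced in 1.0.0: a fixed-rate config keeps the read-write zfp::array2d path, while accuracy, precision, and reversible configs force the read-only zfp::const_array2d path. Below is a minimal sketch of that pattern, not part of the patch; it reuses only calls that appear in the diff (zfp_config_rate, zfp_config_accuracy, config.mode, config.arg.rate, the array constructors), and the header name zfp/constarray2.hpp is an assumption based on the 1.0.0 include layout (only zfp/array2.hpp appears verbatim in this patch).

    // Sketch (not from the patch): select a compression mode via zfp_config
    // and construct the matching compressed-array type.
    // "zfp/constarray2.hpp" is an assumed header name for const_array2d.
    #include <cstddef>
    #include "zfp.h"
    #include "zfp/array2.hpp"
    #include "zfp/constarray2.hpp"

    void demo(size_t nx, size_t ny, size_t cache_size)
    {
      // fixed-rate mode: read-write array; rate is taken from config.arg.rate
      zfp_config rc = zfp_config_rate(8.0, false);
      zfp::array2d u(nx, ny, rc.arg.rate, 0, cache_size);
      u(nx / 2, ny / 2) = 1.0;                       // writes allowed

      // fixed-accuracy mode: read-only array (no source data supplied here)
      zfp_config ac = zfp_config_accuracy(1e-6);
      bool compressed = (ac.mode != zfp_mode_null);  // true for any non-null config
      zfp::const_array2d v(nx, ny, ac, 0, cache_size);
      double x = v(nx / 2, ny / 2);                  // reads allowed; writes are not
      (void)compressed;
      (void)x;
    }

The sanity checks in main() above enforce the same split: only fixed-rate storage supports in-place writes, so the variable-rate modes (-a, -p, -R) are restricted to the read-only arrays selected with -c.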
+ size_t nx; /* grid points in x */ + size_t ny; /* grid points in y */ + int nt; /* number of time steps (0 for default) */ + int x0; /* x location of heat source */ + int y0; /* y location of heat source */ + double k; /* diffusion constant */ + double dx; /* grid spacing in x */ + double dy; /* grid spacing in y */ + double dt; /* time step */ + double tfinal; /* minimum time to run solution to */ + double pi; /* 3.141... */ } constants; void @@ -41,13 +43,13 @@ init_constants(constants* c, int nx, int ny, int nt) c->pi = 3.14159265358979323846; } -// advance solution using integer array indices +/* advance solution using integer array indices */ static void -time_step_indexed_compressed(cfp_array2d* u, const constants* c) +time_step_indexed_compressed(cfp_array2d u, const constants* c) { - // compute du/dt - cfp_array2d* du = _.ctor(c->nx, c->ny, _.rate(u), 0, _.cache_size(u)); - int x, y; + /* compute du/dt */ + cfp_array2d du = _.ctor(c->nx, c->ny, _.rate(u), 0, _.cache_size(u)); + size_t i, x, y; for (y = 1; y < c->ny - 1; y++) { for (x = 1; x < c->nx - 1; x++) { double uxx = (_.get(u, x - 1, y) - 2 * _.get(u, x, y) + _.get(u, x + 1, y)) / (c->dx * c->dx); @@ -55,10 +57,9 @@ time_step_indexed_compressed(cfp_array2d* u, const constants* c) _.set(du, x, y, c->dt * c->k * (uxx + uyy)); } } - // take forward Euler step - uint i; + /* take forward Euler step */ for (i = 0; i < _.size(u); i++) { - // u[i] += du[i] + /* u[i] += du[i] */ double val = _.get_flat(u, i) + _.get_flat(du, i); _.set_flat(u, i, val); } @@ -66,55 +67,84 @@ time_step_indexed_compressed(cfp_array2d* u, const constants* c) _.dtor(du); } -// advance solution using integer array indices +/* advance solution using array iterators */ +static void +time_step_iterated_compressed(cfp_array2d u, const constants* c) +{ + /* compute du/dt */ + cfp_array2d du = _.ctor(c->nx, c->ny, _.rate(u), 0, _.cache_size(u)); + cfp_iter2d p, q; + for (q = _.begin(du); _.iterator.neq(q, _.end(du)); q = _.iterator.inc(q)) { + size_t x = _.iterator.i(q); + size_t y = _.iterator.j(q); + if (1 <= x && x <= c->nx - 2 && + 1 <= y && y <= c->ny - 2) { + double uxx = (_.get(u, x - 1, y) - 2 * _.get(u, x, y) + _.get(u, x + 1, y)) / (c->dx * c->dx); + double uyy = (_.get(u, x, y - 1) - 2 * _.get(u, x, y) + _.get(u, x, y + 1)) / (c->dy * c->dy); + _.iterator.set(q, c->dt * c->k * (uxx + uyy)); + } + } + /* take forward Euler step */ + for (p = _.begin(u), q = _.begin(du); _.iterator.neq(p, _.end(u)); p = _.iterator.inc(p), q = _.iterator.inc(q)) { + /* u[i] += du[i] */ + double val = _.iterator.get(p) + _.iterator.get(q); + _.iterator.set(p, val); + } + + _.dtor(du); +} + +/* advance solution using integer array indices */ static void time_step_indexed(double* u, const constants* c) { - // compute du/dt + /* compute du/dt */ double* du = calloc(c->nx * c->ny, sizeof(double)); - int x, y; - for (y = 1; y < c->ny - 1; y++) { + size_t i, x, y; + for (y = 1; y < c->ny - 1; y++) for (x = 1; x < c->nx - 1; x++) { - double uxx = (u[y*c->nx + (x - 1)] - 2 * u[y*c->nx + x] + u[y*c->nx + (x + 1)]) / (c->dx * c->dx); - double uyy = (u[(y - 1)*c->nx + x] - 2 * u[y*c->nx + x] + u[(y + 1)*c->nx + x]) / (c->dy * c->dy); - du[y*c->nx + x] = c->dt * c->k * (uxx + uyy); + double uxx = (u[(x - 1) + c->nx * y] - 2 * u[x + c->nx * y] + u[(x + 1) + c->nx * y]) / (c->dx * c->dx); + double uyy = (u[x + c->nx * (y - 1)] - 2 * u[x + c->nx * y] + u[x + c->nx * (y + 1)]) / (c->dy * c->dy); + du[x + c->nx * y] = c->dt * c->k * (uxx + uyy); } - } - // take forward Euler step - uint 
i; - for (i = 0; i < (c->nx * c->ny); i++) { - // u[i] += du[i] + /* take forward Euler step */ + for (i = 0; i < c->nx * c->ny; i++) u[i] += du[i]; - } free(du); } -// solve heat equation using +/* solve heat equation using compressed arrays */ static double -solve_compressed(cfp_array2d* u, const constants* c) +solve_compressed(cfp_array2d u, const constants* c, zfp_bool iterator) { - // initialize u with point heat source (u is assumed to be zero initialized) + double t; + + /* initialize u with point heat source (u is assumed to be zero initialized) */ _.set(u, c->x0, c->y0, 1); - // iterate until final time - double t; + /* iterate until final time */ for (t = 0; t < c->tfinal; t += c->dt) { fprintf(stderr, "t=%lf\n", t); - time_step_indexed_compressed(u, c); + if (iterator) + time_step_iterated_compressed(u, c); + else + time_step_indexed_compressed(u, c); } return t; } +/* solve heat equation using uncompressed arrays */ static double solve(double* u, const constants* c) { - // initialize u with point heat source (u is assumed to be zero initialized) - u[c->y0*c->nx + c->x0] = 1; - - // iterate until final time double t; + + /* initialize u with point heat source (u is assumed to be zero initialized) */ + u[c->x0 + c->nx * c->y0] = 1; + + /* iterate until final time */ for (t = 0; t < c->tfinal; t += c->dt) { fprintf(stderr, "t=%lf\n", t); time_step_indexed(u, c); @@ -123,42 +153,42 @@ solve(double* u, const constants* c) return t; } -// compute sum of array values +/* compute sum of array values */ static double -total_compressed(const cfp_array2d* u) +total_compressed(const cfp_array2d u) { double s = 0; - const int nx = _.size_x(u); - const int ny = _.size_y(u); - int x, y; + const size_t nx = _.size_x(u); + const size_t ny = _.size_y(u); + size_t x, y; for (y = 1; y < ny - 1; y++) for (x = 1; x < nx - 1; x++) s += _.get(u, x, y); return s; } -// compute sum of array values +/* compute sum of array values */ static double -total(const double* u, const int nx, const int ny) +total(const double* u, size_t nx, size_t ny) { double s = 0; - int x, y; + size_t x, y; for (y = 1; y < ny - 1; y++) for (x = 1; x < nx - 1; x++) - s += u[y*nx + x]; + s += u[x + nx * y]; return s; } -// compute root mean square error with respect to exact solution +/* compute root mean square error with respect to exact solution */ static double -error_compressed(const cfp_array2d* u, const constants* c, double t) +error_compressed(const cfp_array2d u, const constants* c, double t) { double e = 0; - int x, y; + size_t x, y; for (y = 1; y < c->ny - 1; y++) { - double py = c->dy * (y - c->y0); + double py = c->dy * ((int)y - (int)c->y0); for (x = 1; x < c->nx - 1; x++) { - double px = c->dx * (x - c->x0); + double px = c->dx * ((int)x - (int)c->x0); double f = _.get(u, x, y); double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t); e += (f - g) * (f - g); @@ -167,17 +197,17 @@ error_compressed(const cfp_array2d* u, const constants* c, double t) return sqrt(e / ((c->nx - 2) * (c->ny - 2))); } -// compute root mean square error with respect to exact solution +/* compute root mean square error with respect to exact solution */ static double error(const double* u, const constants* c, double t) { double e = 0; - int x, y; + size_t x, y; for (y = 1; y < c->ny - 1; y++) { - double py = c->dy * (y - c->y0); + double py = c->dy * ((int)y - (int)c->y0); for (x = 1; x < c->nx - 1; x++) { - double px = c->dx * (x - c->x0); - double f = u[y*c->nx + x]; + double px = c->dx * ((int)x - 
(int)c->x0); + double f = u[x + c->nx * y]; double g = c->dx * c->dy * exp(-(px * px + py * py) / (4 * c->k * t)) / (4 * c->pi * c->k * t); e += (f - g) * (f - g); } @@ -190,72 +220,78 @@ usage() { fprintf(stderr, "Usage: diffusionC [options]\n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, "-b : use 'blocks' 4x4 blocks of cache\n"); + fprintf(stderr, "-i : traverse arrays using iterators\n"); fprintf(stderr, "-n : number of grid points\n"); + fprintf(stderr, "-r : use compressed arrays with given compressed bits/value\n"); fprintf(stderr, "-t : number of time steps\n"); - fprintf(stderr, "-r : use compressed arrays with 'rate' bits/value\n"); - fprintf(stderr, "-c : use 'blocks' 4x4 blocks of cache\n"); return EXIT_FAILURE; } int main(int argc, char* argv[]) { - int nx = 100; - int ny = 100; + int nx = 128; + int ny = 128; int nt = 0; + int cache_size = 0; double rate = 64; - int compression = 0; - int cache = 0; + zfp_bool iterator = zfp_false; + zfp_bool compression = zfp_false; + constants* c = 0; + double sum; + double err; - // parse command-line options + /* parse command-line options */ int i; for (i = 1; i < argc; i++) { if (argv[i][0] != '-' || argv[i][2]) return usage(); switch(argv[i][1]) { + case 'b': + if (++i == argc || sscanf(argv[i], "%d", &cache_size) != 1) + return usage(); + cache_size *= (int)(4 * 4 * sizeof(double)); + break; + case 'i': + iterator = zfp_true; + break; case 'n': if (++i == argc || sscanf(argv[i], "%d", &nx) != 1 || ++i == argc || sscanf(argv[i], "%d", &ny) != 1) return usage(); break; - case 't': - if (++i == argc || sscanf(argv[i], "%d", &nt) != 1) - return usage(); - break; case 'r': if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) return usage(); - compression = 1; + compression = zfp_true; break; - case 'c': - if (++i == argc || sscanf(argv[i], "%d", &cache) != 1) + case 't': + if (++i == argc || sscanf(argv[i], "%d", &nt) != 1) return usage(); + break; + default: + return usage(); } } - constants* c = malloc(sizeof(constants)); + c = malloc(sizeof(constants)); init_constants(c, nx, ny, nt); - double sum; - double err; if (compression) { - // solve problem using compressed arrays - cfp_array2d* u = _.ctor(nx, ny, rate, 0, cache * 4 * 4 * sizeof(double)); - - rate = _.rate(u); - double t = solve_compressed(u, c); + /* solve problem using compressed arrays */ + cfp_array2d u = _.ctor(nx, ny, rate, 0, cache_size); + double t = solve_compressed(u, c, iterator); sum = total_compressed(u); err = error_compressed(u, c, t); - + rate = _.rate(u); _.dtor(u); } else { - // solve problem using primitive arrays + /* solve problem using primitive arrays */ double* u = calloc(nx * ny, sizeof(double)); - double t = solve(u, c); sum = total(u, nx, ny); err = error(u, c, t); - free(u); } diff --git a/examples/inplace.c b/examples/inplace.c index 3764166b..9516240d 100644 --- a/examples/inplace.c +++ b/examples/inplace.c @@ -46,7 +46,7 @@ process(double* buffer, uint blocks, double tolerance) ptr = buffer; for (i = 0; i < blocks; i++) { offset[i] = stream_wtell(stream); - bits = zfp_encode_block_double_2(zfp, ptr); + bits = (uint)zfp_encode_block_double_2(zfp, ptr); if (!bits) { fprintf(stderr, "compression failed\n"); return 0; diff --git a/examples/iterator.cpp b/examples/iterator.cpp index 698692ff..94f907de 100644 --- a/examples/iterator.cpp +++ b/examples/iterator.cpp @@ -1,9 +1,9 @@ #include #include #include -#include "zfparray1.h" -#include "zfparray2.h" -#include "zfparray3.h" +#include "zfp/array1.hpp" +#include "zfp/array2.hpp" +#include 
"zfp/array3.hpp" void print1(zfp::array1::pointer p, size_t n) { @@ -17,9 +17,9 @@ void print2(zfp::array2::pointer p, size_t n) std::cout << *p++ << std::endl; } -void print3(zfp::array1::iterator begin, zfp::array1::iterator end) +void print3(zfp::array1::const_iterator begin, zfp::array1::const_iterator end) { - for (zfp::array1::iterator p = begin; p != end; p++) + for (zfp::array1::const_iterator p = begin; p != end; p++) std::cout << *p << std::endl; } diff --git a/examples/iteratorC.c b/examples/iteratorC.c new file mode 100644 index 00000000..8106c493 --- /dev/null +++ b/examples/iteratorC.c @@ -0,0 +1,97 @@ +#include +#include +#include "zfp/array.h" + +void print1(cfp_ptr1d p, size_t n) +{ + size_t i; + const cfp_array1d_api _ = cfp.array1d; + + for (i = 0; i < n; i++) + printf("%g\n", _.reference.get(_.pointer.ref_at(p, i))); +} + +void print2(cfp_ptr2d p, size_t n) +{ + const cfp_array2d_api _ = cfp.array2d; + + while (n--) { + printf("%g\n", _.reference.get(_.pointer.ref(p))); + p = _.pointer.inc(p); + } +} + +void print3(cfp_iter1d begin, cfp_iter1d end) +{ + const cfp_array1d_api _ = cfp.array1d; + cfp_iter1d p; + + for (p = begin; !_.iterator.eq(p, end); p = _.iterator.inc(p)) + printf("%g\n", _.reference.get(_.iterator.ref(p))); +} + +int main() +{ + const cfp_array1d_api _1d = cfp.array1d; + const cfp_array2d_api _2d = cfp.array2d; + const cfp_array3d_api _3d = cfp.array3d; + cfp_array1d v; + cfp_iter1d it1; + cfp_array2d a; + cfp_iter2d it2; + cfp_ptr2d pb2; + cfp_ptr2d pe2; + cfp_array3d b; + cfp_iter3d it3; + cfp_ptr3d pb3; + cfp_ptr3d pe3; + size_t i, j, k; + + /* some fun with 1D arrays */ + v = _1d.ctor(10, 64.0, 0, 0); + /* initialize and print array of random values */ + for (it1 = _1d.begin(v); !_1d.iterator.eq(it1, _1d.end(v)); it1 = _1d.iterator.inc(it1)) + _1d.reference.set(_1d.iterator.ref(it1), rand()); + printf("random array\n"); + print1(_1d.ptr(v, 0), _1d.size(v)); + printf("\n"); + + /* some fun with 2D arrays */ + a = _2d.ctor(5, 7, 64.0, 0, 0); + /* print array indices visited in block-order traversal*/ + printf("block order (x, y) indices\n"); + for (it2 = _2d.begin(a); !_2d.iterator.eq(it2, _2d.end(a)); it2 = _2d.iterator.inc(it2)) { + i = _2d.iterator.i(it2); + j = _2d.iterator.j(it2); + printf("(%lu, %lu)\n", (unsigned long)i, (unsigned long)j); + _2d.reference.set(_2d.iterator.ref(it2), i + 10 * j); + } + printf("\n"); + + /* print array contents in row-major order */ + printf("row-major order yx indices\n"); + print2(_2d.ptr_flat(a, 0), _2d.size(a)); + printf("\n"); + /* pointer arithmetic */ + pb2 = _2d.reference.ptr(_2d.iterator.ref(_2d.begin(a))); + pe2 = _2d.reference.ptr(_2d.iterator.ref(_2d.end(a))); + printf("%lu * %lu = %ld\n", (unsigned long)_2d.size_x(a), (unsigned long)_2d.size_y(a), (long)_2d.pointer.distance(pb2, pe2)); + + /* some fun with 3D arrays */ + b = _3d.ctor(7, 2, 5, 64.0, 0, 0); + /* print array indices visited in block-order traversal */ + printf("block order (x, y, z) indices\n"); + for (it3 = _3d.begin(b); !_3d.iterator.eq(it3, _3d.end(b)); it3 = _3d.iterator.inc(it3)) { + i = _3d.iterator.i(it3); + j = _3d.iterator.j(it3); + k = _3d.iterator.k(it3); + printf("(%lu, %lu, %lu)\n", (unsigned long)i, (unsigned long)j, (unsigned long)k); + } + printf("\n"); + /* pointer arithmetic */ + pb3 = _3d.reference.ptr(_3d.iterator.ref(_3d.begin(b))); + pe3 = _3d.reference.ptr(_3d.iterator.ref(_3d.end(b))); + printf("%lu * %lu * %lu = %ld\n", (unsigned long)_3d.size_x(b), (unsigned long)_3d.size_y(b), (unsigned long)_3d.size_z(b), 
(long)_3d.pointer.distance(pb3, pe3)); + + return 0; +} diff --git a/examples/pgm.c b/examples/pgm.c index c23ecb2d..ce580dc7 100644 --- a/examples/pgm.c +++ b/examples/pgm.c @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) if (rate < 0) zfp_stream_set_precision(zfp, (uint)floor(0.5 - rate)); else - zfp_stream_set_rate(zfp, rate, zfp_type_int32, 2, 0); + zfp_stream_set_rate(zfp, rate, zfp_type_int32, 2, zfp_false); bytes = zfp_stream_maximum_size(zfp, field); buffer = malloc(bytes); stream = stream_open(buffer, bytes); diff --git a/examples/ppm.c b/examples/ppm.c new file mode 100644 index 00000000..4b989a30 --- /dev/null +++ b/examples/ppm.c @@ -0,0 +1,390 @@ +/* +This simple example shows how zfp can be used to compress 8-bit color images +stored in the PPM image format. This lossy compressor employs two common image +compression strategies: (1) transformation to the YCoCg color space, which +decorrelates color bands, and (2) chroma subsampling, which reduces spatial +resolution in the Co and Cg chrominance bands. The single command-line argument +selects one of two compression modes: if a positive rate (in bits/pixel) is +specified, fixed-rate mode is selected; a negative integer argument, -p, sets +the precision to p in fixed-precision mode. Rate allocation in fixed-rate mode +assigns more bits to luma than to chroma components due to the relatively higher +information content in luma after chroma subsampling. + +The YCoCg transform employed here has been adapted to avoid range expansion and +potential overflow. Chroma subsampling is achieved by performing zfp's forward +decorrelating transform and then zeroing all but the four lowest-sequency +coefficients, effectively reducing each chroma block to a bilinear approximation. + +Because only four chroma coefficients per 4x4 pixel block are retained, an +alternative to zeroing and then encoding the remaining twelve zero-valued +coefficients is to treat the chroma block as being one-dimensional, with only +four values, and then compressing it using zfp's 1D codec. The dimensionality +of chroma blocks (1 or 2) is specified at compile time via the PPM_CHROMA macro. + +NOTE: To keep this example simple, only images whose dimensions are multiples +of four are supported. 
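As a worked illustration of the allocation implemented in main() below (default PPM_CHROMA=2), suppose a rate of 12 bits/pixel is requested:

    chroma_rate = floor(8 * 12 / 3 + 0.5) / 16 = floor(32.5) / 16 = 2 bits/value
    luma_rate   = 12 - 2 * 2                   = 8 bits/pixel

Each 4x4 pixel block then nominally costs 16 * 8 = 128 bits for Y plus 16 * 2 = 32 bits
each for Co and Cg, i.e. 12 bits/pixel in total, though the rates actually granted by
zfp_stream_set_rate() may be rounded slightly to zfp's supported granularity.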
+*/ + +#ifdef PPM_CHROMA + #if PPM_CHROMA != 1 && PPM_CHROMA != 2 + #error "compile with PPM_CHROMA=1 or PPM_CHROMA=2" + #endif +#else + /* default */ + #define PPM_CHROMA 2 +#endif + +#include +#include +#include +#include +#include +#include "zfp.h" + +/* clamp values to 31-bit range */ +static void +clamp(int32* block, uint n) +{ + uint i; + for (i = 0; i < n; i++) { + if (block[i] < 1 - (1 << 30)) + block[i] = 1 - (1 << 30); + if (block[i] > (1 << 30) - 1) + block[i] = (1 << 30) - 1; + } +} + +/* convert 2D block from RGB to YCoCg color space */ +static void +rgb2ycocg(int32 ycocg[3][16], /*const*/ int32 rgb[3][16]) +{ + uint i; + for (i = 0; i < 16; i++) { + int32 r, g, b; + int32 y, co, cg, t; + /* fetch RGB values */ + r = rgb[0][i]; + g = rgb[1][i]; + b = rgb[2][i]; + /* perform range-preserving YCoCg forward transform */ + co = (r - b) >> 1; + t = b + co; + cg = (g - t) >> 1; + y = t + cg; + /* store YCoCg values */ + ycocg[0][i] = y; + ycocg[1][i] = co; + ycocg[2][i] = cg; + } +} + +/* convert 2D block from YCoCg to RGB color space */ +static void +ycocg2rgb(int32 rgb[3][16], /*const*/ int32 ycocg[3][16]) +{ + uint i; + for (i = 0; i < 16; i++) { + int32 r, g, b; + int32 y, co, cg, t; + /* fetch YCoCg values */ + y = ycocg[0][i]; + co = ycocg[1][i]; + cg = ycocg[2][i]; + /* perform range-preserving YCoCg inverse transform */ + t = y - cg; + g = (cg << 1) + t; + b = t - co; + r = (co << 1) + b; + /* store RGB values */ + rgb[0][i] = r; + rgb[1][i] = g; + rgb[2][i] = b; + } +} + +/* perform partial forward decorrelating transform */ +static void +fwd_lift(int32* p, uint s) +{ + int32 x, y, z, w; + x = *p; p += s; + y = *p; p += s; + z = *p; p += s; + w = *p; p += s; + + x += w; x >>= 1; w -= x; + z += y; z >>= 1; y -= z; + x += z; x >>= 1; z -= x; + w += y; w >>= 1; y -= w; + w += y >> 1; y -= w >> 1; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + +/* perform partial inverse decorrelating transform */ +static void +inv_lift(int32* p, uint s) +{ + int32 x, y, z, w; + x = *p; p += s; + y = *p; p += s; + z = *p; p += s; + w = *p; p += s; + + y += w >> 1; w -= y >> 1; + y += w; w <<= 1; w -= y; + z += x; x <<= 1; x -= z; + y += z; z <<= 1; z -= y; + w += x; x <<= 1; x -= w; + + p -= s; *p = w; + p -= s; *p = z; + p -= s; *p = y; + p -= s; *p = x; +} + +/* perform chroma subsampling by discarding high-frequency components */ +static void +chroma_downsample(int32* block) +{ + uint i, j; + /* perform forward decorrelating transform */ + for (j = 0; j < 4; j++) + fwd_lift(block + 4 * j, 1); + for (i = 0; i < 4; i++) + fwd_lift(block + 1 * i, 4); +#if PPM_CHROMA == 1 + /* keep only the four lowest-sequency coefficients */ + block[2] = block[4]; + block[3] = block[5]; + for (i = 4; i < 16; i++) + block[i] = 0; + /* reconstruct as 1D block */ + inv_lift(block, 1); + /* clamp values to 31 bits to avoid overflow */ + clamp(block, 4); +#else + /* zero out all but four lowest-sequency coefficients */ + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + if (i >= 2 || j >= 2) + block[i + 4 * j] = 0; + /* perform inverse decorrelating transform */ + for (i = 0; i < 4; i++) + inv_lift(block + 1 * i, 4); + for (j = 0; j < 4; j++) + inv_lift(block + 4 * j, 1); + /* clamp values to 31 bits to avoid overflow */ + clamp(block, 16); +#endif +} + +/* reconstruct 2D chroma block */ +static void +chroma_upsample(int32* block) +{ +#if PPM_CHROMA == 1 + uint i, j; + /* obtain 1D block coefficients */ + fwd_lift(block, 1); + /* reorganize and initialize remaining 2D block 
coefficients */ + block[4] = block[2]; + block[5] = block[3]; + block[2] = 0; + block[3] = 0; + for (i = 6; i < 16; i++) + block[i] = 0; + /* perform inverse decorrelating transform */ + for (i = 0; i < 4; i++) + inv_lift(block + 1 * i, 4); + for (j = 0; j < 4; j++) + inv_lift(block + 4 * j, 1); + /* clamp values to 31 bits to avoid overflow */ + clamp(block, 16); +#else + /* clamp values to 31 bits to avoid overflow */ + clamp(block, 16); +#endif +} + +int main(int argc, char* argv[]) +{ + double rate = 0; + uint nx, ny; + uint x, y; + uint k; + char line[0x100]; + uchar* image; + zfp_field* field; + zfp_stream* zfp[3]; + bitstream* stream; + void* buffer; + size_t bytes; + size_t size; + + switch (argc) { + case 2: + if (sscanf(argv[1], "%lf", &rate) != 1) + goto usage; + break; + default: + usage: + fprintf(stderr, "Usage: ppm output.ppm\n"); + return EXIT_FAILURE; + } + + /* read ppm header */ + if (!fgets(line, sizeof(line), stdin) || strcmp(line, "P6\n") || + !fgets(line, sizeof(line), stdin) || sscanf(line, "%u%u", &nx, &ny) != 2 || + !fgets(line, sizeof(line), stdin) || strcmp(line, "255\n")) { + fprintf(stderr, "error opening image\n"); + return EXIT_FAILURE; + } + if ((nx & 3u) || (ny & 3u)) { + fprintf(stderr, "image dimensions must be multiples of four\n"); + return EXIT_FAILURE; + } + + /* read image data */ + image = malloc(3 * nx * ny); + if (!image) { + fprintf(stderr, "error allocating memory\n"); + return EXIT_FAILURE; + } + if (fread(image, sizeof(*image), 3 * nx * ny, stdin) != 3 * nx * ny) { + fprintf(stderr, "error reading image\n"); + return EXIT_FAILURE; + } + + /* initialize compressed streams */ + for (k = 0; k < 3; k++) + zfp[k] = zfp_stream_open(NULL); + if (rate < 0) { + /* use fixed-precision mode */ + for (k = 0; k < 3; k++) + zfp_stream_set_precision(zfp[k], (uint)floor(0.5 - rate)); + } + else { + /* assign higher rate to luminance than to chrominance components */ +#if PPM_CHROMA == 1 + double chroma_rate = floor(8 * rate / 3 + 0.5) / 4; + double luma_rate = rate - chroma_rate / 2; + zfp_stream_set_rate(zfp[0], luma_rate, zfp_type_int32, 2, zfp_false); + zfp_stream_set_rate(zfp[1], chroma_rate, zfp_type_int32, 1, zfp_false); + zfp_stream_set_rate(zfp[2], chroma_rate, zfp_type_int32, 1, zfp_false); +#else + double chroma_rate = floor(8 * rate / 3 + 0.5) / 16; + double luma_rate = rate - 2 * chroma_rate; + zfp_stream_set_rate(zfp[0], luma_rate, zfp_type_int32, 2, zfp_false); + zfp_stream_set_rate(zfp[1], chroma_rate, zfp_type_int32, 2, zfp_false); + zfp_stream_set_rate(zfp[2], chroma_rate, zfp_type_int32, 2, zfp_false); +#endif + } + + /* determine size of compressed buffer */ + bytes = 0; + field = zfp_field_2d(image, zfp_type_int32, nx, ny); + for (k = 0; k < 3; k++) + bytes += zfp_stream_maximum_size(zfp[k], field); + zfp_field_free(field); + + /* allocate buffer and initialize bit stream */ + buffer = malloc(bytes); + if (!buffer) { + fprintf(stderr, "error allocating memory\n"); + return EXIT_FAILURE; + } + stream = stream_open(buffer, bytes); + + /* the three zfp streams share a single bit stream */ + for (k = 0; k < 3; k++) + zfp_stream_set_bit_stream(zfp[k], stream); + + /* compress image */ + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + uchar block[3][16]; + int32 rgb[3][16]; + int32 ycocg[3][16]; + uint i, j, k; + /* fetch R, G, and B blocks */ + for (k = 0; k < 3; k++) + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + block[k][i + 4 * j] = image[k + 3 * (x + i + nx * (y + j))]; + /* promote to 32-bit integers */ + for (k = 0; 
k < 3; k++) + zfp_promote_uint8_to_int32(rgb[k], block[k], 2); + /* perform color space transform */ + rgb2ycocg(ycocg, rgb); + /* chroma subsample the Co and Cg bands */ + for (k = 1; k < 3; k++) + chroma_downsample(ycocg[k]); + /* compress the Y, Co, and Cg blocks */ +#if PPM_CHROMA == 1 + zfp_encode_block_int32_2(zfp[0], ycocg[0]); + zfp_encode_block_int32_1(zfp[1], ycocg[1]); + zfp_encode_block_int32_1(zfp[2], ycocg[2]); +#else + for (k = 0; k < 3; k++) + zfp_encode_block_int32_2(zfp[k], ycocg[k]); +#endif + } + + zfp_stream_flush(zfp[0]); + size = zfp_stream_compressed_size(zfp[0]); + fprintf(stderr, "%u compressed bytes (%.2f bits/pixel)\n", (uint)size, (double)size * CHAR_BIT / (nx * ny)); + + /* decompress image */ + zfp_stream_rewind(zfp[0]); + for (y = 0; y < ny; y += 4) + for (x = 0; x < nx; x += 4) { + uchar block[3][16]; + int32 rgb[3][16]; + int32 ycocg[3][16]; + uint i, j, k; + /* decompress the Y, Co, and Cg blocks */ +#if PPM_CHROMA == 1 + zfp_decode_block_int32_2(zfp[0], ycocg[0]); + zfp_decode_block_int32_1(zfp[1], ycocg[1]); + zfp_decode_block_int32_1(zfp[2], ycocg[2]); +#else + for (k = 0; k < 3; k++) + zfp_decode_block_int32_2(zfp[k], ycocg[k]); +#endif + /* reconstruct Co and Cg chroma bands */ + for (k = 1; k < 3; k++) + chroma_upsample(ycocg[k]); + /* perform color space transform */ + ycocg2rgb(rgb, ycocg); + /* demote to 8-bit integers */ + for (k = 0; k < 3; k++) + zfp_demote_int32_to_uint8(block[k], rgb[k], 2); + /* store R, G, and B blocks */ + for (k = 0; k < 3; k++) + for (j = 0; j < 4; j++) + for (i = 0; i < 4; i++) + image[k + 3 * (x + i + nx * (y + j))] = block[k][i + 4 * j]; + } + + /* clean up */ + for (k = 0; k < 3; k++) + zfp_stream_close(zfp[k]); + stream_close(stream); + free(buffer); + + /* output reconstructed image */ + printf("P6\n"); + printf("%u %u\n", nx, ny); + printf("255\n"); + if (fwrite(image, sizeof(*image), 3 * nx * ny, stdout) != 3 * nx * ny) { + fprintf(stderr, "error writing image\n"); + return EXIT_FAILURE; + } + free(image); + + return 0; +} diff --git a/examples/simple.c b/examples/simple.c index 2ccb5977..d2261301 100644 --- a/examples/simple.c +++ b/examples/simple.c @@ -8,7 +8,7 @@ /* compress or decompress array */ static int -compress(double* array, int nx, int ny, int nz, double tolerance, int decompress) +compress(double* array, size_t nx, size_t ny, size_t nz, double tolerance, zfp_bool decompress) { int status = 0; /* return value: 0 = success */ zfp_type type; /* array scalar type */ @@ -26,8 +26,9 @@ compress(double* array, int nx, int ny, int nz, double tolerance, int decompress /* allocate meta data for a compressed stream */ zfp = zfp_stream_open(NULL); - /* set compression mode and parameters via one of three functions */ -/* zfp_stream_set_rate(zfp, rate, type, 3, 0); */ + /* set compression mode and parameters via one of four functions */ +/* zfp_stream_set_reversible(zfp); */ +/* zfp_stream_set_rate(zfp, rate, type, zfp_field_dimensionality(field), zfp_false); */ /* zfp_stream_set_precision(zfp, precision); */ zfp_stream_set_accuracy(zfp, tolerance); @@ -42,19 +43,21 @@ compress(double* array, int nx, int ny, int nz, double tolerance, int decompress /* compress or decompress entire array */ if (decompress) { - /* read compressed stream and decompress array */ + /* read compressed stream and decompress and output array */ zfpsize = fread(buffer, 1, bufsize, stdin); if (!zfp_decompress(zfp, field)) { fprintf(stderr, "decompression failed\n"); - status = 1; + status = EXIT_FAILURE; } + else + fwrite(array, 
sizeof(double), zfp_field_size(field, NULL), stdout); } else { /* compress array and output compressed stream */ zfpsize = zfp_compress(zfp, field); if (!zfpsize) { fprintf(stderr, "compression failed\n"); - status = 1; + status = EXIT_FAILURE; } else fwrite(buffer, 1, zfpsize, stdout); @@ -73,17 +76,17 @@ compress(double* array, int nx, int ny, int nz, double tolerance, int decompress int main(int argc, char* argv[]) { /* use -d to decompress rather than compress data */ - int decompress = (argc == 2 && !strcmp(argv[1], "-d")); + zfp_bool decompress = (argc == 2 && !strcmp(argv[1], "-d")); /* allocate 100x100x100 array of doubles */ - int nx = 100; - int ny = 100; - int nz = 100; + size_t nx = 100; + size_t ny = 100; + size_t nz = 100; double* array = malloc(nx * ny * nz * sizeof(double)); if (!decompress) { /* initialize array to be compressed */ - int i, j, k; + size_t i, j, k; for (k = 0; k < nz; k++) for (j = 0; j < ny; j++) for (i = 0; i < nx; i++) { diff --git a/examples/speed.c b/examples/speed.c index 9332605d..e75f4285 100644 --- a/examples/speed.c +++ b/examples/speed.c @@ -103,7 +103,7 @@ int main(int argc, char* argv[]) /* allocate storage for compressed bit stream */ zfp = zfp_stream_open(NULL); - zfp_stream_set_rate(zfp, rate, zfp_field_type(field), zfp_field_dimensionality(field), 0); + zfp_stream_set_rate(zfp, rate, zfp_field_type(field), zfp_field_dimensionality(field), zfp_false); bytes = zfp_stream_maximum_size(zfp, field); buffer = malloc(bytes); stream = stream_open(buffer, bytes); diff --git a/fortran/CMakeLists.txt b/fortran/CMakeLists.txt index 22381df4..9c376348 100644 --- a/fortran/CMakeLists.txt +++ b/fortran/CMakeLists.txt @@ -13,5 +13,19 @@ set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/modules) set(CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} ${bounds}") set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${dialect}") -add_library(zFORp zfp.f) +add_library(zFORp zfp.f90) target_link_libraries(zFORp PRIVATE zfp) + +set_property(TARGET zFORp PROPERTY VERSION ${ZFP_VERSION}) +set_property(TARGET zFORp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR}) +set_property(TARGET zFORp PROPERTY OUTPUT_NAME ${ZFP_LIBRARY_PREFIX}zFORp) + +# install location for module file +install(FILES ${CMAKE_Fortran_MODULE_DIRECTORY}/zfp.mod + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +# install location for library +install(TARGETS zFORp EXPORT cFORp-targets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/fortran/Makefile b/fortran/Makefile index 229bf42c..9e514868 100644 --- a/fortran/Makefile +++ b/fortran/Makefile @@ -1,14 +1,16 @@ include ../Config +.SUFFIXES: .f90 + LIBDIR = ../lib MODDIR = ../modules -TARGETS = $(LIBDIR)/libzFORp.a $(LIBDIR)/libzFORp.so $(MODDIR)/zforp_module.mod +TARGETS = $(LIBDIR)/libzFORp.a $(LIBDIR)/libzFORp.so $(MODDIR)/zfp.mod OBJECTS = zfp.o -MODULES = zforp_module.mod +MODULES = zfp.mod -static: $(LIBDIR)/libzFORp.a $(MODDIR)/zforp_module.mod +static: $(LIBDIR)/libzFORp.a $(MODDIR)/zforp.mod -shared: $(LIBDIR)/libzFORp.so $(MODDIR)/zforp_module.mod +shared: $(LIBDIR)/libzFORp.so $(MODDIR)/zforp.mod clean: rm -f $(TARGETS) $(OBJECTS) @@ -22,9 +24,9 @@ $(LIBDIR)/libzFORp.so: $(OBJECTS) mkdir -p $(LIBDIR) $(FC) $(FFLAGS) -shared $^ -o $@ -$(MODDIR)/zforp_module.mod: $(OBJECTS) +$(MODDIR)/zforp.mod: $(OBJECTS) mkdir -p $(MODDIR) - mv zforp_module.mod $(MODDIR) + mv $(MODULES) $(MODDIR) -.f.o: +.f90.o: $(FC) $(FFLAGS) -c $< diff --git 
a/fortran/zfp.f b/fortran/zfp.f90 similarity index 78% rename from fortran/zfp.f rename to fortran/zfp.f90 index 3ce9563c..f671d144 100644 --- a/fortran/zfp.f +++ b/fortran/zfp.f90 @@ -1,6 +1,6 @@ -module zFORp_module +module zfp - use, intrinsic :: iso_c_binding, only: c_int, c_int64_t, c_size_t, c_double, c_ptr, c_null_ptr, c_loc + use, intrinsic :: iso_c_binding, only: c_int, c_int64_t, c_size_t, c_ptrdiff_t, c_double, c_ptr, c_null_ptr, c_loc implicit none private @@ -20,6 +20,11 @@ module zFORp_module type(c_ptr) :: object = c_null_ptr end type zFORp_field + type, bind(c) :: zFORp_config + private + type(c_ptr) :: object = c_null_ptr + end type zFORp_config + enum, bind(c) enumerator :: zFORp_type_none = 0, & zFORp_type_int32 = 1, & @@ -46,28 +51,31 @@ module zFORp_module ! constants are hardcoded ! const_xyz holds value, but xyz is the public constant - integer, parameter :: const_zFORp_version_major = 0 - integer, parameter :: const_zFORp_version_minor = 5 - integer, parameter :: const_zFORp_version_patch = 5 + integer, parameter :: const_zFORp_version_major = 1 + integer, parameter :: const_zFORp_version_minor = 0 + integer, parameter :: const_zFORp_version_patch = 0 + integer, parameter :: const_zFORp_version_tweak = 0 integer, protected, bind(c, name="zFORp_version_major") :: zFORp_version_major integer, protected, bind(c, name="zFORp_version_minor") :: zFORp_version_minor integer, protected, bind(c, name="zFORp_version_patch") :: zFORp_version_patch + integer, protected, bind(c, name="zFORp_version_tweak") :: zFORp_version_tweak data zFORp_version_major/const_zFORp_version_major/, & zFORp_version_minor/const_zFORp_version_minor/, & - zFORp_version_patch/const_zFORp_version_patch/ + zFORp_version_patch/const_zFORp_version_patch/, & + zFORp_version_tweak/const_zFORp_version_tweak/ integer, parameter :: const_zFORp_codec_version = 5 integer, protected, bind(c, name="zFORp_codec_version") :: zFORp_codec_version data zFORp_codec_version/const_zFORp_codec_version/ - integer, parameter :: const_zFORp_library_version = 85 ! 0x55 + integer, parameter :: const_zFORp_library_version = 4096 ! 0x1000 integer, protected, bind(c, name="zFORp_library_version") :: zFORp_library_version data zFORp_library_version/const_zFORp_library_version/ - character(len = 36), parameter :: zFORp_version_string = 'zfp version 0.5.5 (May 5, 2019)' + character(len = 36), parameter :: zFORp_version_string = 'zfp version 1.0.0 (August 1, 2022)' integer, parameter :: const_zFORp_min_bits = 1 - integer, parameter :: const_zFORp_max_bits = 16657 + integer, parameter :: const_zFORp_max_bits = 16658 integer, parameter :: const_zFORp_max_prec = 64 integer, parameter :: const_zFORp_min_exp = -1074 integer, protected, bind(c, name="zFORp_min_bits") :: zFORp_min_bits @@ -135,8 +143,8 @@ subroutine zfp_bitstream_stream_close(bs) bind(c, name="stream_close") function zfp_type_size(scalar_type) result(type_size) bind(c, name="zfp_type_size") import - integer(c_int) scalar_type - integer(c_size_t) type_size + integer(c_int), value :: scalar_type + integer(c_size_t) :: type_size end function ! 
high-level API: zfp_stream functions @@ -164,10 +172,29 @@ function zfp_stream_compression_mode(stream) result(zfp_mode) bind(c, name="zfp_ integer(c_int) :: zfp_mode end function + function zfp_stream_rate(stream, dims) result(rate_result) bind(c, name="zfp_stream_rate") + import + type(c_ptr), value :: stream + integer(c_int), value :: dims + real(c_double) :: rate_result + end function + + function zfp_stream_precision(stream) result(prec_result) bind(c, name="zfp_stream_precision") + import + type(c_ptr), value :: stream + integer(c_int) :: prec_result + end function + + function zfp_stream_accuracy(stream) result(acc_result) bind(c, name="zfp_stream_accuracy") + import + type(c_ptr), value :: stream + real(c_double) :: acc_result + end function + function zfp_stream_mode(stream) result(encoded_mode) bind(c, name="zfp_stream_mode") import type(c_ptr), value :: stream - integer(c_int64_t) encoded_mode + integer(c_int64_t) :: encoded_mode end function subroutine zfp_stream_params(stream, minbits, maxbits, maxprec, minexp) bind(c, name="zfp_stream_params") @@ -188,6 +215,11 @@ function zfp_stream_maximum_size(stream, field) result(max_size) bind(c, name="z integer(c_size_t) max_size end function + subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") + import + type(c_ptr), value :: stream + end subroutine + subroutine zfp_stream_set_bit_stream(stream, bs) bind(c, name="zfp_stream_set_bit_stream") import type(c_ptr), value :: stream, bs @@ -198,13 +230,13 @@ subroutine zfp_stream_set_reversible(stream) bind(c, name="zfp_stream_set_revers type(c_ptr), value :: stream end subroutine - function zfp_stream_set_rate(stream, rate, scalar_type, dims, wra) result(rate_result) bind(c, name="zfp_stream_set_rate") + function zfp_stream_set_rate(stream, rate, scalar_type, dims, align) result(rate_result) bind(c, name="zfp_stream_set_rate") import type(c_ptr), value :: stream real(c_double), value :: rate integer(c_int), value :: scalar_type ! no unsigned int in Fortran - integer(c_int), value :: dims, wra + integer(c_int), value :: dims, align real(c_double) :: rate_result end function @@ -275,6 +307,15 @@ function zfp_stream_set_omp_chunk_size(stream, chunk_size) result(is_success) bi integer(c_int) chunk_size, is_success end function + ! TODO: high-level API: zfp_config functions (resolve Fortran's lack of unions) + + ! zfp_config_none + ! zfp_config_rate + ! zfp_config_precision + ! zfp_config_accuracy + ! zfp_config_reversible + ! zfp_config_expert + ! 
high-level API: zfp_field functions function zfp_field_alloc() result(field) bind(c, name="zfp_field_alloc") @@ -286,28 +327,32 @@ function zfp_field_1d(uncompressed_ptr, scalar_type, nx) result(field) bind(c, n import type(c_ptr), value :: uncompressed_ptr type(c_ptr) :: field - integer(c_int), value :: scalar_type, nx + integer(c_int), value :: scalar_type + integer(c_size_t), value :: nx end function function zfp_field_2d(uncompressed_ptr, scalar_type, nx, ny) result(field) bind(c, name="zfp_field_2d") import type(c_ptr), value :: uncompressed_ptr type(c_ptr) :: field - integer(c_int), value :: scalar_type, nx, ny + integer(c_int), value :: scalar_type + integer(c_size_t), value :: nx, ny end function function zfp_field_3d(uncompressed_ptr, scalar_type, nx, ny, nz) result(field) bind(c, name="zfp_field_3d") import type(c_ptr), value :: uncompressed_ptr type(c_ptr) :: field - integer(c_int), value :: scalar_type, nx, ny, nz + integer(c_int), value :: scalar_type + integer(c_size_t), value :: nx, ny, nz end function function zfp_field_4d(uncompressed_ptr, scalar_type, nx, ny, nz, nw) result(field) bind(c, name="zfp_field_4d") import type(c_ptr), value :: uncompressed_ptr type(c_ptr) :: field - integer(c_int), value :: scalar_type, nx, ny, nz, nw + integer(c_int), value :: scalar_type + integer(c_size_t), value :: nx, ny, nz, nw end function subroutine zfp_field_free(field) bind(c, name="zfp_field_free") @@ -321,40 +366,64 @@ function zfp_field_pointer(field) result(arr_ptr) bind(c, name="zfp_field_pointe type(c_ptr) :: arr_ptr end function + function zfp_field_begin(field) result(begin_ptr) bind(c, name="zfp_field_begin") + import + type(c_ptr), value :: field + type(c_ptr) :: begin_ptr + end function + function zfp_field_type(field) result(scalar_type) bind(c, name="zfp_field_type") import type(c_ptr), value :: field - integer(c_int) scalar_type + integer(c_int) :: scalar_type end function function zfp_field_precision(field) result(prec) bind(c, name="zfp_field_precision") import type(c_ptr), value :: field - integer(c_int) prec + integer(c_int) :: prec end function function zfp_field_dimensionality(field) result(dims) bind(c, name="zfp_field_dimensionality") import type(c_ptr), value :: field - integer(c_int) dims + integer(c_int) :: dims end function function zfp_field_size(field, size_arr) result(total_size) bind(c, name="zfp_field_size") import type(c_ptr), value :: field, size_arr - integer(c_size_t) total_size + integer(c_size_t) :: total_size + end function + + function zfp_field_size_bytes(field) result(byte_size) bind(c, name="zfp_field_size_bytes") + import + type(c_ptr), value :: field + integer(c_size_t) :: byte_size + end function + + function zfp_field_blocks(field) result(blocks) bind(c, name="zfp_field_blocks") + import + type(c_ptr), value :: field + integer(c_size_t) :: blocks end function function zfp_field_stride(field, stride_arr) result(is_strided) bind(c, name="zfp_field_stride") import type(c_ptr), value :: field, stride_arr - integer(c_int) is_strided + integer(c_int) :: is_strided + end function + + function zfp_field_is_contiguous(field) result(is_contiguous) bind(c, name="zfp_field_is_contiguous") + import + type(c_ptr), value :: field + integer(c_int) :: is_contiguous end function function zfp_field_metadata(field) result(encoded_metadata) bind(c, name="zfp_field_metadata") import type(c_ptr), value :: field - integer(c_int64_t) encoded_metadata + integer(c_int64_t) :: encoded_metadata end function subroutine zfp_field_set_pointer(field, arr_ptr) bind(c, 
name="zfp_field_set_pointer") @@ -371,56 +440,56 @@ function zfp_field_set_type(field, scalar_type) result(scalar_type_result) bind( subroutine zfp_field_set_size_1d(field, nx) bind(c, name="zfp_field_set_size_1d") import type(c_ptr), value :: field - integer(c_int) nx + integer(c_size_t) :: nx end subroutine subroutine zfp_field_set_size_2d(field, nx, ny) bind(c, name="zfp_field_set_size_2d") import type(c_ptr), value :: field - integer(c_int) nx, ny + integer(c_size_t) :: nx, ny end subroutine subroutine zfp_field_set_size_3d(field, nx, ny, nz) bind(c, name="zfp_field_set_size_3d") import type(c_ptr), value :: field - integer(c_int) nx, ny, nz + integer(c_size_t) :: nx, ny, nz end subroutine subroutine zfp_field_set_size_4d(field, nx, ny, nz, nw) bind(c, name="zfp_field_set_size_4d") import type(c_ptr), value :: field - integer(c_int) nx, ny, nz, nw + integer(c_size_t) :: nx, ny, nz, nw end subroutine subroutine zfp_field_set_stride_1d(field, sx) bind(c, name="zfp_field_set_stride_1d") import type(c_ptr), value :: field - integer(c_int) sx + integer(c_ptrdiff_t) :: sx end subroutine subroutine zfp_field_set_stride_2d(field, sx, sy) bind(c, name="zfp_field_set_stride_2d") import type(c_ptr), value :: field - integer(c_int) sx, sy + integer(c_ptrdiff_t) :: sx, sy end subroutine subroutine zfp_field_set_stride_3d(field, sx, sy, sz) bind(c, name="zfp_field_set_stride_3d") import type(c_ptr), value :: field - integer(c_int) sx, sy, sz + integer(c_ptrdiff_t) :: sx, sy, sz end subroutine subroutine zfp_field_set_stride_4d(field, sx, sy, sz, sw) bind(c, name="zfp_field_set_stride_4d") import type(c_ptr), value :: field - integer(c_int) sx, sy, sz, sw + integer(c_ptrdiff_t) :: sx, sy, sz, sw end subroutine function zfp_field_set_metadata(field, encoded_metadata) result(is_success) bind(c, name="zfp_field_set_metadata") import type(c_ptr), value :: field integer(c_int64_t) :: encoded_metadata - integer(c_int) is_success + integer(c_int) :: is_success end function ! high-level API: compression and decompression @@ -440,30 +509,25 @@ function zfp_decompress(stream, field) result(bitstream_offset_bytes) bind(c, na function zfp_write_header(stream, field, mask) result(num_bits_written) bind(c, name="zfp_write_header") import type(c_ptr), value :: stream, field - integer(c_int) mask - integer(c_size_t) num_bits_written + integer(c_int) :: mask + integer(c_size_t) :: num_bits_written end function function zfp_read_header(stream, field, mask) result(num_bits_read) bind(c, name="zfp_read_header") import type(c_ptr), value :: stream, field - integer(c_int) mask - integer(c_size_t) num_bits_read + integer(c_int) :: mask + integer(c_size_t) :: num_bits_read end function - ! low-level API: stream manipulation - subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") - import - type(c_ptr), value :: stream - end subroutine - end interface ! types public :: zFORp_bitstream, & zFORp_stream, & - zFORp_field + zFORp_field, & + zFORp_config ! enums @@ -477,7 +541,8 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_mode_expert, & zFORp_mode_fixed_rate, & zFORp_mode_fixed_precision, & - zFORp_mode_fixed_accuracy + zFORp_mode_fixed_accuracy, & + zFORp_mode_reversible public :: zFORp_exec_serial, & zFORp_exec_omp, & @@ -486,7 +551,8 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") ! 
C macros -> constants public :: zFORp_version_major, & zFORp_version_minor, & - zFORp_version_patch + zFORp_version_patch, & + zFORp_version_tweak public :: zFORp_codec_version, & zFORp_library_version, & @@ -524,10 +590,14 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_stream_close, & zFORp_stream_bit_stream, & zFORp_stream_compression_mode, & + zFORp_stream_rate, & + zFORp_stream_precision, & + zFORp_stream_accuracy, & zFORp_stream_mode, & zFORp_stream_params, & zFORp_stream_compressed_size, & zFORp_stream_maximum_size, & + zFORp_stream_rewind, & zFORp_stream_set_bit_stream, & zFORp_stream_set_reversible, & zFORp_stream_set_rate, & @@ -537,6 +607,7 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_stream_set_params ! high-level API: execution policy functions + public :: zFORp_stream_execution, & zFORp_stream_omp_threads, & zFORp_stream_omp_chunk_size, & @@ -544,6 +615,15 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_stream_set_omp_threads, & zFORp_stream_set_omp_chunk_size + ! TODO: high-level API: compression mode and parameter settings + + ! public :: zFORp_config_none, & + ! zFORp_config_rate, & + ! zFORp_config_precision, & + ! zFORp_config_accuracy, & + ! zFORp_config_reversible, & + ! zFORp_config_expert + ! high-level API: zfp_field functions public :: zFORp_field_alloc, & @@ -553,11 +633,15 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_field_4d, & zFORp_field_free, & zFORp_field_pointer, & + zFORp_field_begin, & zFORp_field_type, & zFORp_field_precision, & zFORp_field_dimensionality, & zFORp_field_size, & + zFORp_field_size_bytes, & + zFORp_field_blocks, & zFORp_field_stride, & + zFORp_field_is_contiguous, & zFORp_field_metadata, & zFORp_field_set_pointer, & zFORp_field_set_type, & @@ -578,10 +662,6 @@ subroutine zfp_stream_rewind(stream) bind(c, name="zfp_stream_rewind") zFORp_write_header, & zFORp_read_header - ! low-level API: stream manipulation - - public :: zFORp_stream_rewind - contains ! 
minimal bitstream API @@ -605,7 +685,7 @@ end subroutine zFORp_bitstream_stream_close function zFORp_type_size(scalar_type) result(type_size) bind(c, name="zforp_type_size") implicit none integer, intent(in) :: scalar_type - integer (kind=8) type_size + integer (kind=8) :: type_size type_size = zfp_type_size(int(scalar_type, c_int)) end function zFORp_type_size @@ -634,14 +714,36 @@ end function zFORp_stream_bit_stream function zFORp_stream_compression_mode(stream) result(zfp_mode) bind(c, name="zforp_stream_compression_mode") implicit none type(zFORp_stream), intent(in) :: stream - integer zfp_mode + integer :: zfp_mode zfp_mode = zfp_stream_compression_mode(stream%object) end function zFORp_stream_compression_mode + function zFORp_stream_rate(stream, dims) result(rate_result) bind(c, name="zforp_stream_rate") + implicit none + type(zFORp_stream), intent(in) :: stream + integer, intent(in) :: dims + real (kind=8) :: rate_result + rate_result = zfp_stream_rate(stream%object, int(dims, c_int)) + end function zFORp_stream_rate + + function zFORp_stream_precision(stream) result(prec_result) bind(c, name="zforp_stream_precision") + implicit none + type(zFORp_stream), intent(in) :: stream + integer :: prec_result + prec_result = zfp_stream_precision(stream%object) + end function zFORp_stream_precision + + function zFORp_stream_accuracy(stream) result(acc_result) bind(c, name="zforp_stream_accuracy") + implicit none + type(zFORp_stream), intent(in) :: stream + real (kind=8) :: acc_result + acc_result = zfp_stream_accuracy(stream%object) + end function zFORp_stream_accuracy + function zFORp_stream_mode(stream) result(encoded_mode) bind(c, name="zforp_stream_mode") implicit none type(zFORp_stream), intent(in) :: stream - integer (kind=8) encoded_mode + integer (kind=8) :: encoded_mode encoded_mode = zfp_stream_mode(stream%object) end function zFORp_stream_mode @@ -658,7 +760,7 @@ end subroutine zFORp_stream_params function zFORp_stream_compressed_size(stream) result(compressed_size) bind(c, name="zforp_stream_compressed_size") implicit none type(zFORp_stream), intent(in) :: stream - integer (kind=8) compressed_size + integer (kind=8) :: compressed_size compressed_size = zfp_stream_compressed_size(stream%object) end function zFORp_stream_compressed_size @@ -666,10 +768,15 @@ function zFORp_stream_maximum_size(stream, field) result(max_size) bind(c, name= implicit none type(zFORp_stream), intent(in) :: stream type(zFORp_field), intent(in) :: field - integer (kind=8) max_size + integer (kind=8) :: max_size max_size = zfp_stream_maximum_size(stream%object, field%object) end function zFORp_stream_maximum_size + subroutine zFORp_stream_rewind(stream) bind(c, name="zforp_stream_rewind") + type(zFORp_stream), intent(in) :: stream + call zfp_stream_rewind(stream%object) + end subroutine zFORp_stream_rewind + subroutine zFORp_stream_set_bit_stream(stream, bs) bind(c, name="zforp_stream_set_bit_stream") type(zFORp_stream), intent(in) :: stream type(zFORp_bitstream), intent(in) :: bs @@ -681,38 +788,38 @@ subroutine zFORp_stream_set_reversible(stream) bind(c, name="zforp_stream_set_re call zfp_stream_set_reversible(stream%object) end subroutine zFORp_stream_set_reversible - function zFORp_stream_set_rate(stream, rate, scalar_type, dims, wra) result(rate_result) bind(c, name="zforp_stream_set_rate") + function zFORp_stream_set_rate(stream, rate, scalar_type, dims, align) result(rate_result) bind(c, name="zforp_stream_set_rate") implicit none type(zFORp_stream), intent(in) :: stream real (kind=8), intent(in) :: 
rate integer, intent(in) :: scalar_type - integer, intent(in) :: dims, wra + integer, intent(in) :: dims, align real (kind=8) :: rate_result rate_result = zfp_stream_set_rate(stream%object, real(rate, c_double), & - int(scalar_type, c_int), int(dims, c_int), int(wra, c_int)) + int(scalar_type, c_int), int(dims, c_int), int(align, c_int)) end function zFORp_stream_set_rate function zFORp_stream_set_precision(stream, prec) result(prec_result) bind(c, name="zforp_stream_set_precision") implicit none type(zFORp_stream), intent(in) :: stream integer, intent(in) :: prec - integer prec_result + integer :: prec_result prec_result = zfp_stream_set_precision(stream%object, int(prec, c_int)) end function zFORp_stream_set_precision - function zFORp_stream_set_accuracy(stream, acc) result(acc_result) bind(c, name="zforp_stream_set_accuracy") + function zFORp_stream_set_accuracy(stream, tolerance) result(acc_result) bind(c, name="zforp_stream_set_accuracy") implicit none type(zFORp_stream), intent(in) :: stream - real (kind=8), intent(in) :: acc - real (kind=8) acc_result - acc_result = zfp_stream_set_accuracy(stream%object, real(acc, c_double)) + real (kind=8), intent(in) :: tolerance + real (kind=8) :: acc_result + acc_result = zfp_stream_set_accuracy(stream%object, real(tolerance, c_double)) end function zFORp_stream_set_accuracy function zFORp_stream_set_mode(stream, encoded_mode) result(mode_result) bind(c, name="zforp_stream_set_mode") implicit none type(zFORp_stream), intent(in) :: stream integer (kind=8), intent(in) :: encoded_mode - integer mode_result + integer :: mode_result mode_result = zfp_stream_set_mode(stream%object, int(encoded_mode, c_int64_t)) end function zFORp_stream_set_mode @@ -721,7 +828,7 @@ function zFORp_stream_set_params(stream, minbits, maxbits, maxprec, minexp) resu implicit none type(zFORp_stream), intent(in) :: stream integer, intent(in) :: minbits, maxbits, maxprec, minexp - integer is_success + integer :: is_success is_success = zfp_stream_set_params(stream%object, & int(minbits, c_int), & int(maxbits, c_int), & @@ -734,21 +841,21 @@ end function zFORp_stream_set_params function zFORp_stream_execution(stream) result(execution_policy) bind(c, name="zforp_stream_execution") implicit none type(zFORp_stream), intent(in) :: stream - integer execution_policy + integer :: execution_policy execution_policy = zfp_stream_execution(stream%object) end function zFORp_stream_execution function zFORp_stream_omp_threads(stream) result(thread_count) bind(c, name="zforp_stream_omp_threads") implicit none type(zFORp_stream), intent(in) :: stream - integer thread_count + integer :: thread_count thread_count = zfp_stream_omp_threads(stream%object) end function zFORp_stream_omp_threads function zFORp_stream_omp_chunk_size(stream) result(chunk_size_blocks) bind(c, name="zforp_stream_omp_chunk_size") implicit none type(zFORp_stream), intent(in) :: stream - integer (kind=8) chunk_size_blocks + integer (kind=8) :: chunk_size_blocks chunk_size_blocks = zfp_stream_omp_chunk_size(stream%object) end function zFORp_stream_omp_chunk_size @@ -756,7 +863,7 @@ function zFORp_stream_set_execution(stream, execution_policy) result(is_success) implicit none type(zFORp_stream), intent(in) :: stream integer, intent(in) :: execution_policy - integer is_success + integer :: is_success is_success = zfp_stream_set_execution(stream%object, int(execution_policy, c_int)) end function zFORp_stream_set_execution @@ -764,7 +871,7 @@ function zFORp_stream_set_omp_threads(stream, thread_count) result(is_success) b 
implicit none type(zFORp_stream), intent(in) :: stream integer, intent(in) :: thread_count - integer is_success + integer :: is_success is_success = zfp_stream_set_omp_threads(stream%object, int(thread_count, c_int)) end function zFORp_stream_set_omp_threads @@ -773,15 +880,24 @@ function zFORp_stream_set_omp_chunk_size(stream, chunk_size) result(is_success) implicit none type(zFORp_stream), intent(in) :: stream integer, intent(in) :: chunk_size - integer is_success + integer :: is_success is_success = zfp_stream_set_omp_chunk_size(stream%object, int(chunk_size, c_int)) end function zFORp_stream_set_omp_chunk_size + ! TODO: high-level API: compression mode and parameter settings + + ! zfp_config_none + ! zfp_config_rate + ! zfp_config_precision + ! zfp_config_accuracy + ! zfp_config_reversible + ! zfp_config_expert + ! high-level API: zfp_field functions function zFORp_field_alloc() result(field) bind(c, name="zforp_field_alloc") implicit none - type(zFORp_field) field + type(zFORp_field) :: field field%object = zfp_field_alloc() end function zFORp_field_alloc @@ -789,38 +905,38 @@ function zFORp_field_1d(uncompressed_ptr, scalar_type, nx) result(field) bind(c, implicit none type(c_ptr), intent(in) :: uncompressed_ptr integer, intent(in) :: scalar_type, nx - type(zFORp_field) field + type(zFORp_field) :: field field%object = zfp_field_1d(uncompressed_ptr, int(scalar_type, c_int), & - int(nx, c_int)) + int(nx, c_size_t)) end function zFORp_field_1d function zFORp_field_2d(uncompressed_ptr, scalar_type, nx, ny) result(field) bind(c, name="zforp_field_2d") implicit none type(c_ptr), intent(in) :: uncompressed_ptr integer, intent(in) :: scalar_type, nx, ny - type(zFORp_field) field + type(zFORp_field) :: field field%object = zfp_field_2d(uncompressed_ptr, int(scalar_type, c_int), & - int(nx, c_int), int(ny, c_int)) + int(nx, c_size_t), int(ny, c_size_t)) end function zFORp_field_2d function zFORp_field_3d(uncompressed_ptr, scalar_type, nx, ny, nz) result(field) bind(c, name="zforp_field_3d") implicit none type(c_ptr), intent(in) :: uncompressed_ptr integer, intent(in) :: scalar_type, nx, ny, nz - type(zFORp_field) field + type(zFORp_field) :: field field%object = zfp_field_3d(uncompressed_ptr, int(scalar_type, c_int), & - int(nx, c_int), int(ny, c_int), & - int(nz, c_int)) + int(nx, c_size_t), int(ny, c_size_t), & + int(nz, c_size_t)) end function zFORp_field_3d function zFORp_field_4d(uncompressed_ptr, scalar_type, nx, ny, nz, nw) result(field) bind(c, name="zforp_field_4d") implicit none type(c_ptr), intent(in) :: uncompressed_ptr integer, intent(in) :: scalar_type, nx, ny, nz, nw - type(zFORp_field) field + type(zFORp_field) :: field field%object = zfp_field_4d(uncompressed_ptr, int(scalar_type, c_int), & - int(nx, c_int), int(ny, c_int), & - int(nz, c_int), int(nw, c_int)) + int(nx, c_size_t), int(ny, c_size_t), & + int(nz, c_size_t), int(nw, c_size_t)) end function zFORp_field_4d subroutine zFORp_field_free(field) bind(c, name="zforp_field_free") @@ -832,28 +948,35 @@ end subroutine zFORp_field_free function zFORp_field_pointer(field) result(arr_ptr) bind(c, name="zforp_field_pointer") implicit none type(zFORp_field), intent(in) :: field - type(c_ptr) arr_ptr + type(c_ptr) :: arr_ptr arr_ptr = zfp_field_pointer(field%object) end function zFORp_field_pointer + function zFORp_field_begin(field) result(begin_ptr) bind(c, name="zforp_field_begin") + implicit none + type(zFORp_field), intent(in) :: field + type(c_ptr) :: begin_ptr + begin_ptr = zfp_field_begin(field%object) + end function 
zFORp_field_begin + function zFORp_field_type(field) result(scalar_type) bind(c, name="zforp_field_type") implicit none type(zFORp_field), intent(in) :: field - integer scalar_type + integer :: scalar_type scalar_type = zfp_field_type(field%object) end function zFORp_field_type function zFORp_field_precision(field) result(prec) bind(c, name="zforp_field_precision") implicit none type(zFORp_field), intent(in) :: field - integer prec + integer :: prec prec = zfp_field_precision(field%object) end function zFORp_field_precision function zFORp_field_dimensionality(field) result(dims) bind(c, name="zforp_field_dimensionality") implicit none type(zFORp_field), intent(in) :: field - integer dims + integer :: dims dims = zfp_field_dimensionality(field%object) end function zFORp_field_dimensionality @@ -861,22 +984,43 @@ function zFORp_field_size(field, size_arr) result(total_size) bind(c, name="zfor implicit none type(zFORp_field), intent(in) :: field integer, dimension(4), target, intent(inout) :: size_arr - integer (kind=8) total_size + integer (kind=8) :: total_size total_size = zfp_field_size(field%object, c_loc(size_arr)) end function zFORp_field_size + function zFORp_field_size_bytes(field) result(byte_size) bind(c, name="zforp_field_size_bytes") + implicit none + type(zFORp_field), intent(in) :: field + integer (kind=8) :: byte_size + byte_size = zfp_field_size_bytes(field%object) + end function zFORp_field_size_bytes + + function zFORp_field_blocks(field) result(blocks) bind(c, name="zforp_field_blocks") + implicit none + type(zFORp_field), intent(in) :: field + integer (kind=8) :: blocks + blocks = zfp_field_blocks(field%object) + end function zFORp_field_blocks + function zFORp_field_stride(field, stride_arr) result(is_strided) bind(c, name="zforp_field_stride") implicit none type(zFORp_field), intent(in) :: field integer, dimension(4), target, intent(inout) :: stride_arr - integer is_strided + integer :: is_strided is_strided = zfp_field_stride(field%object, c_loc(stride_arr)) end function zFORp_field_stride + function zFORp_field_is_contiguous(field) result(is_contiguous) bind(c, name="zforp_field_is_contiguous") + implicit none + type(zFORp_field), intent(in) :: field + integer :: is_contiguous + is_contiguous = zfp_field_is_contiguous(field%object) + end function zFORp_field_is_contiguous + function zFORp_field_metadata(field) result(encoded_metadata) bind(c, name="zforp_field_metadata") implicit none type(zFORp_field), intent(in) :: field - integer (kind=8) encoded_metadata + integer (kind=8) :: encoded_metadata encoded_metadata = zfp_field_metadata(field%object) end function zFORp_field_metadata @@ -890,63 +1034,64 @@ function zFORp_field_set_type(field, scalar_type) result(scalar_type_result) bin implicit none type(zFORp_field), intent(in) :: field integer, intent(in) :: scalar_type - integer scalar_type_result + integer :: scalar_type_result scalar_type_result = zfp_field_set_type(field%object, int(scalar_type, c_int)) end function zFORp_field_set_type subroutine zFORp_field_set_size_1d(field, nx) bind(c, name="zforp_field_set_size_1d") type(zFORp_field), intent(in) :: field integer, intent(in) :: nx - call zfp_field_set_size_1d(field%object, int(nx, c_int)) + call zfp_field_set_size_1d(field%object, int(nx, c_size_t)) end subroutine zFORp_field_set_size_1d subroutine zFORp_field_set_size_2d(field, nx, ny) bind(c, name="zforp_field_set_size_2d") type(zFORp_field), intent(in) :: field integer, intent(in) :: nx, ny - call zfp_field_set_size_2d(field%object, int(nx, c_int), int(ny, 
c_int)) + call zfp_field_set_size_2d(field%object, int(nx, c_size_t), int(ny, c_size_t)) end subroutine zFORp_field_set_size_2d subroutine zFORp_field_set_size_3d(field, nx, ny, nz) bind(c, name="zforp_field_set_size_3d") type(zFORp_field), intent(in) :: field integer, intent(in) :: nx, ny, nz - call zfp_field_set_size_3d(field%object, int(nx, c_int), int(ny, c_int), int(nz, c_int)) + call zfp_field_set_size_3d(field%object, int(nx, c_size_t), int(ny, c_size_t), int(nz, c_size_t)) end subroutine zFORp_field_set_size_3d subroutine zFORp_field_set_size_4d(field, nx, ny, nz, nw) bind(c, name="zforp_field_set_size_4d") type(zFORp_field), intent(in) :: field integer, intent(in) :: nx, ny, nz, nw - call zfp_field_set_size_4d(field%object, int(nx, c_int), int(ny, c_int), int(nz, c_int), int(nw, c_int)) + call zfp_field_set_size_4d(field%object, int(nx, c_size_t), int(ny, c_size_t), int(nz, c_size_t), int(nw, c_size_t)) end subroutine zFORp_field_set_size_4d subroutine zFORp_field_set_stride_1d(field, sx) bind(c, name="zforp_field_set_stride_1d") type(zFORp_field), intent(in) :: field integer, intent(in) :: sx - call zfp_field_set_stride_1d(field%object, int(sx, c_int)) + call zfp_field_set_stride_1d(field%object, int(sx, c_ptrdiff_t)) end subroutine zFORp_field_set_stride_1d subroutine zFORp_field_set_stride_2d(field, sx, sy) bind(c, name="zforp_field_set_stride_2d") type(zFORp_field), intent(in) :: field integer, intent(in) :: sx, sy - call zfp_field_set_stride_2d(field%object, int(sx, c_int), int(sy, c_int)) + call zfp_field_set_stride_2d(field%object, int(sx, c_ptrdiff_t), int(sy, c_ptrdiff_t)) end subroutine zFORp_field_set_stride_2d subroutine zFORp_field_set_stride_3d(field, sx, sy, sz) bind(c, name="zforp_field_set_stride_3d") type(zFORp_field), intent(in) :: field integer, intent(in) :: sx, sy, sz - call zfp_field_set_stride_3d(field%object, int(sx, c_int), int(sy, c_int), int(sz, c_int)) + call zfp_field_set_stride_3d(field%object, int(sx, c_ptrdiff_t), int(sy, c_ptrdiff_t), int(sz, c_ptrdiff_t)) end subroutine zFORp_field_set_stride_3d subroutine zFORp_field_set_stride_4d(field, sx, sy, sz, sw) bind(c, name="zforp_field_set_stride_4d") type(zFORp_field), intent(in) :: field integer, intent(in) :: sx, sy, sz, sw - call zfp_field_set_stride_4d(field%object, int(sx, c_int), int(sy, c_int), int(sz, c_int), int(sw, c_int)) + call zfp_field_set_stride_4d(field%object, int(sx, c_ptrdiff_t), int(sy, c_ptrdiff_t), & + int(sz, c_ptrdiff_t), int(sw, c_ptrdiff_t)) end subroutine zFORp_field_set_stride_4d function zFORp_field_set_metadata(field, encoded_metadata) result(is_success) bind(c, name="zforp_field_set_metadata") implicit none type(zFORp_field), intent(in) :: field integer (kind=8), intent(in) :: encoded_metadata - integer is_success + integer :: is_success is_success = zfp_field_set_metadata(field%object, int(encoded_metadata, c_int64_t)) end function zFORp_field_set_metadata @@ -956,7 +1101,7 @@ function zFORp_compress(stream, field) result(bitstream_offset_bytes) bind(c, na implicit none type(zFORp_stream), intent(in) :: stream type(zFORp_field), intent(in) :: field - integer (kind=8) bitstream_offset_bytes + integer (kind=8) :: bitstream_offset_bytes bitstream_offset_bytes = zfp_compress(stream%object, field%object) end function zFORp_compress @@ -964,7 +1109,7 @@ function zFORp_decompress(stream, field) result(bitstream_offset_bytes) bind(c, implicit none type(zFORp_stream), intent(in) :: stream type(zFORp_field), intent(in) :: field - integer (kind=8) bitstream_offset_bytes + integer 
(kind=8) :: bitstream_offset_bytes bitstream_offset_bytes = zfp_decompress(stream%object, field%object) end function zFORp_decompress @@ -973,7 +1118,7 @@ function zFORp_write_header(stream, field, mask) result(num_bits_written) bind(c type(zFORp_stream), intent(in) :: stream type(zFORp_field), intent(in) :: field integer, intent(in) :: mask - integer (kind=8) num_bits_written + integer (kind=8) :: num_bits_written num_bits_written = zfp_write_header(stream%object, field%object, int(mask, c_int)) end function zFORp_write_header @@ -982,15 +1127,8 @@ function zFORp_read_header(stream, field, mask) result(num_bits_read) bind(c, na type(zFORp_stream), intent(in) :: stream type(zFORp_field), intent(in) :: field integer, intent(in) :: mask - integer (kind=8) num_bits_read + integer (kind=8) :: num_bits_read num_bits_read = zfp_read_header(stream%object, field%object, int(mask, c_int)) end function zFORp_read_header - ! low-level API: stream manipulation - - subroutine zFORp_stream_rewind(stream) bind(c, name="zforp_stream_rewind") - type(zFORp_stream), intent(in) :: stream - call zfp_stream_rewind(stream%object) - end subroutine zFORp_stream_rewind - -end module zFORp_module +end module zfp diff --git a/include/zfp.h b/include/zfp.h index b2cc25d0..9db4a387 100644 --- a/include/zfp.h +++ b/include/zfp.h @@ -1,109 +1,43 @@ /* -** Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -** Produced at the Lawrence Livermore National Laboratory. -** Authors: Peter Lindstrom, Markus Salasoo, Matt Larsen, Stephen Herbein. -** LLNL-CODE-663824. -** All rights reserved. -** -** This file is part of the zfp library. -** For details, see http://computation.llnl.gov/casc/zfp/. -** -** Redistribution and use in source and binary forms, with or without -** modification, are permitted provided that the following conditions are met: -** -** 1. Redistributions of source code must retain the above copyright notice, -** this list of conditions and the disclaimer below. -** -** 2. Redistributions in binary form must reproduce the above copyright notice, -** this list of conditions and the disclaimer (as noted below) in the -** documentation and/or other materials provided with the distribution. -** -** 3. Neither the name of the LLNS/LLNL nor the names of its contributors may -** be used to endorse or promote products derived from this software without -** specific prior written permission. -** -** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -** ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, -** LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -** INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -** (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -** LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -** ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -** -** -** Additional BSD Notice -** -** 1. This notice is required to be provided under our contract with the U.S. -** Department of Energy (DOE). This work was produced at Lawrence Livermore -** National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. 
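The hunks below replace this legacy notice with a short SPDX header and migrate the public C API to size_t extents, ptrdiff_t strides, and zfp_bool flags, the same widths the zFORp wrappers above now pass through c_size_t, c_ptrdiff_t, and the Boolean align argument. A rough sketch of driving the revised API for fixed-rate 2D compression follows (illustrative only, not part of the patch; the compress2d helper name is hypothetical and error handling is omitted):

#include <cstdlib>
#include "zfp.h"

/* hypothetical helper: compress nx*ny doubles at a fixed rate; returns payload size in bytes */
static size_t compress2d(double* data, size_t nx, size_t ny, double rate)
{
  zfp_field* field = zfp_field_2d(data, zfp_type_double, nx, ny);  /* size_t extents */
  zfp_stream* zfp = zfp_stream_open(NULL);
  zfp_stream_set_rate(zfp, rate, zfp_type_double, 2, zfp_true);    /* zfp_bool align flag */

  size_t bufsize = zfp_stream_maximum_size(zfp, field);            /* worst-case payload size */
  void* buffer = std::malloc(bufsize);
  bitstream* stream = stream_open(buffer, bufsize);
  zfp_stream_set_bit_stream(zfp, stream);
  zfp_stream_rewind(zfp);

  size_t bytes = zfp_compress(zfp, field);                         /* zero indicates failure */

  zfp_field_free(field);
  zfp_stream_close(zfp);
  stream_close(stream);
  std::free(buffer);  /* a real caller would hand the payload off before freeing it */
  return bytes;
}

Decompression is symmetric: rewind the stream and call zfp_decompress with the same field and mode settings.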
- -** 2. Neither the United States Government nor Lawrence Livermore National -** Security, LLC nor any of their employees, makes any warranty, express or -** implied, or assumes any liability or responsibility for the accuracy, -** completeness, or usefulness of any information, apparatus, product, or -** process disclosed, or represents that its use would not infringe -** privately-owned rights. -** -** 3. Also, reference herein to any specific commercial products, process, or -** services by trade name, trademark, manufacturer or otherwise does not -** necessarily constitute or imply its endorsement, recommendation, or -** favoring by the United States Government or Lawrence Livermore National -** Security, LLC. The views and opinions of authors expressed herein do not -** necessarily state or reflect those of the United States Government or -** Lawrence Livermore National Security, LLC, and shall not be used for -** advertising or product endorsement purposes. +** Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC and +** other zfp project contributors. See the top-level LICENSE file for details. +** SPDX-License-Identifier: BSD-3-Clause */ #ifndef ZFP_H #define ZFP_H -#include "zfp/types.h" -#include "zfp/system.h" -#include "bitstream.h" +#include "zfp/bitstream.h" +#include "zfp/version.h" +#include "zfp/internal/zfp/system.h" +#include "zfp/internal/zfp/types.h" /* macros ------------------------------------------------------------------ */ -/* stringification */ -#define _zfp_str_(x) # x -#define _zfp_str(x) _zfp_str_(x) - -/* library version information */ -#define ZFP_VERSION_MAJOR 0 /* library major version number */ -#define ZFP_VERSION_MINOR 5 /* library minor version number */ -#define ZFP_VERSION_PATCH 5 /* library patch version number */ -#define ZFP_VERSION_RELEASE ZFP_VERSION_PATCH - -/* codec version number (see also zfp_codec_version) */ -#define ZFP_CODEC 5 - -/* library version number (see also zfp_library_version) */ -#define ZFP_VERSION \ - ((ZFP_VERSION_MAJOR << 8) + \ - (ZFP_VERSION_MINOR << 4) + \ - (ZFP_VERSION_PATCH << 0)) - -/* library version string (see also zfp_version_string) */ -#define ZFP_VERSION_STRING \ - _zfp_str(ZFP_VERSION_MAJOR) "." \ - _zfp_str(ZFP_VERSION_MINOR) "." 
\ - _zfp_str(ZFP_VERSION_PATCH) - /* default compression parameters */ #define ZFP_MIN_BITS 1 /* minimum number of bits per block */ -#define ZFP_MAX_BITS 16657 /* maximum number of bits per block */ +#define ZFP_MAX_BITS 16658 /* maximum number of bits per block */ #define ZFP_MAX_PREC 64 /* maximum precision supported */ #define ZFP_MIN_EXP -1074 /* minimum floating-point base-2 exponent */ /* header masks (enable via bitwise or; reader must use same mask) */ +#define ZFP_HEADER_NONE 0x0u /* no header */ #define ZFP_HEADER_MAGIC 0x1u /* embed 64-bit magic */ #define ZFP_HEADER_META 0x2u /* embed 52-bit field metadata */ #define ZFP_HEADER_MODE 0x4u /* embed 12- or 64-bit compression mode */ #define ZFP_HEADER_FULL 0x7u /* embed all of the above */ +/* bit masks for specifying storage class */ +#define ZFP_DATA_UNUSED 0x01u /* allocated but unused storage */ +#define ZFP_DATA_PADDING 0x02u /* padding for alignment purposes */ +#define ZFP_DATA_META 0x04u /* class members and other fixed-size storage */ +#define ZFP_DATA_MISC 0x08u /* miscellaneous uncategorized storage */ +#define ZFP_DATA_PAYLOAD 0x10u /* compressed data */ +#define ZFP_DATA_INDEX 0x20u /* variable-rate block index information */ +#define ZFP_DATA_CACHE 0x40u /* uncompressed cached data */ +#define ZFP_DATA_HEADER 0x80u /* header information */ +#define ZFP_DATA_ALL 0xffu /* all storage */ + /* field metadata indeterminate state and error code */ #define ZFP_META_NULL (UINT64C(-1)) @@ -115,8 +49,21 @@ #define ZFP_HEADER_MAX_BITS 148 /* max number of header bits */ #define ZFP_MODE_SHORT_MAX ((1u << ZFP_MODE_SHORT_BITS) - 2) +/* rounding mode for reducing bias; see build option ZFP_ROUNDING_MODE */ +#define ZFP_ROUND_FIRST (-1) /* round during compression */ +#define ZFP_ROUND_NEVER 0 /* never round */ +#define ZFP_ROUND_LAST 1 /* round during decompression */ + /* types ------------------------------------------------------------------- */ +/* Boolean constants */ +enum { + zfp_false = 0, /* false */ + zfp_true = !zfp_false /* true */ +}; + +typedef int zfp_bool; /* Boolean type */ + /* execution policy */ typedef enum { zfp_exec_serial = 0, /* serial execution (default) */ @@ -130,14 +77,9 @@ typedef struct { uint chunk_size; /* number of blocks per chunk (1D only) */ } zfp_exec_params_omp; -/* execution parameters */ -typedef union { - zfp_exec_params_omp omp; /* OpenMP parameters */ -} zfp_exec_params; - typedef struct { zfp_exec_policy policy; /* execution policy (serial, omp, ...) 
*/ - zfp_exec_params params; /* execution parameters */ + void* params; /* execution parameters */ } zfp_execution; /* compressed stream; use accessors to get/set members */ @@ -160,6 +102,22 @@ typedef enum { zfp_mode_reversible = 5 /* reversible (lossless) mode */ } zfp_mode; +/* compression mode and parameter settings */ +typedef struct { + zfp_mode mode; /* compression mode */ + union { + double rate; /* compressed bits/value (negative for word alignment) */ + uint precision; /* uncompressed bits/value */ + double tolerance; /* absolute error tolerance */ + struct { + uint minbits; /* min number of compressed bits/block */ + uint maxbits; /* max number of compressed bits/block */ + uint maxprec; /* max number of uncompressed bits/value */ + int minexp; /* min floating point bit plane number to store */ + } expert; /* expert mode arguments */ + } arg; /* arguments corresponding to compression mode */ +} zfp_config; + /* scalar type */ typedef enum { zfp_type_none = 0, /* unspecified type */ @@ -171,10 +129,10 @@ typedef enum { /* uncompressed array; use accessors to get/set members */ typedef struct { - zfp_type type; /* scalar type (e.g. int32, double) */ - uint nx, ny, nz, nw; /* sizes (zero for unused dimensions) */ - int sx, sy, sz, sw; /* strides (zero for contiguous array a[nw][nz][ny][nx]) */ - void* data; /* pointer to array data */ + zfp_type type; /* scalar type (e.g. int32, double) */ + size_t nx, ny, nz, nw; /* sizes (zero for unused dimensions) */ + ptrdiff_t sx, sy, sz, sw; /* strides (zero for contiguous array a[nw][nz][ny][nx]) */ + void* data; /* pointer to array data */ } zfp_field; #ifdef __cplusplus @@ -216,12 +174,31 @@ zfp_stream_bit_stream( const zfp_stream* stream /* compressed stream */ ); -/* returns enum of compression mode */ -zfp_mode /* enum for compression mode */ +/* enumerated compression mode */ +zfp_mode /* compression mode or zfp_mode_null if not set */ zfp_stream_compression_mode( const zfp_stream* stream /* compressed stream */ ); +/* rate in compressed bits/scalar (when in fixed-rate mode) */ +double /* rate or zero upon failure */ +zfp_stream_rate( + const zfp_stream* stream, /* compressed stream */ + uint dims /* array dimensionality (1, 2, 3, or 4) */ +); + +/* precision in uncompressed bits/scalar (when in fixed-precision mode) */ +uint /* precision or zero upon failure */ +zfp_stream_precision( + const zfp_stream* stream /* compressed stream */ +); + +/* accuracy as absolute error tolerance (when in fixed-accuracy mode) */ +double /* tolerance or zero upon failure */ +zfp_stream_accuracy( + const zfp_stream* stream /* compressed stream */ +); + /* get all compression parameters in a compact representation */ uint64 /* 12- or 64-bit encoding of parameters */ zfp_stream_mode( @@ -279,7 +256,7 @@ zfp_stream_set_rate( double rate, /* desired rate in compressed bits/scalar */ zfp_type type, /* scalar type to compress */ uint dims, /* array dimensionality (1, 2, 3, or 4) */ - int wra /* nonzero if write random access is needed */ + zfp_bool align /* word-aligned blocks, e.g., for write random access */ ); /* set precision in uncompressed bits/scalar (fixed-precision mode) */ @@ -304,7 +281,7 @@ zfp_stream_set_mode( ); /* set all parameters (expert mode); leaves stream intact on failure */ -int /* nonzero upon success */ +zfp_bool /* true upon success */ zfp_stream_set_params( zfp_stream* stream, /* compressed stream */ uint minbits, /* minimum number of bits per 4^d block */ @@ -334,26 +311,64 @@ zfp_stream_omp_chunk_size( ); /* set execution policy 
*/ -int /* nonzero upon success */ +zfp_bool /* true upon success */ zfp_stream_set_execution( zfp_stream* stream, /* compressed stream */ zfp_exec_policy policy /* execution policy */ ); /* set OpenMP execution policy and number of threads */ -int /* nonzero upon success */ +zfp_bool /* true upon success */ zfp_stream_set_omp_threads( zfp_stream* stream, /* compressed stream */ uint threads /* number of OpenMP threads to use (0 for default) */ ); /* set OpenMP execution policy and number of blocks per chunk (1D only) */ -int /* nonzero upon success */ +zfp_bool /* true upon success */ zfp_stream_set_omp_chunk_size( zfp_stream* stream, /* compressed stream */ uint chunk_size /* number of blocks per chunk (0 for default) */ ); +/* high-level API: compression mode and parameter settings ----------------- */ + +/* unspecified configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_none(); + +/* fixed-rate configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_rate( + double rate, /* desired rate in compressed bits/scalar */ + zfp_bool align /* word-aligned blocks, e.g., for write random access */ +); + +/* fixed-precision configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_precision( + uint precision /* desired precision in uncompressed bits/scalar */ +); + +/* fixed-accuracy configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_accuracy( + double tolerance /* desired error tolerance */ +); + +/* reversible (lossless) configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_reversible(); + +/* expert configuration */ +zfp_config /* compression mode and parameter settings */ +zfp_config_expert( + uint minbits, /* minimum number of bits per 4^d block */ + uint maxbits, /* maximum number of bits per 4^d block */ + uint maxprec, /* maximum precision (# bit planes coded) */ + int minexp /* minimum base-2 exponent; error <= 2^minexp */ +); + /* high-level API: uncompressed array construction/destruction ------------- */ /* allocate field struct */ @@ -365,7 +380,7 @@ zfp_field* /* allocated field metadata */ zfp_field_1d( void* pointer, /* pointer to uncompressed scalars (may be NULL) */ zfp_type type, /* scalar type */ - uint nx /* number of scalars */ + size_t nx /* number of scalars */ ); /* allocate metadata for 2D field f[ny][nx] */ @@ -373,8 +388,8 @@ zfp_field* /* allocated field metadata */ zfp_field_2d( void* pointer, /* pointer to uncompressed scalars (may be NULL) */ zfp_type type, /* scalar type */ - uint nx, /* number of scalars in x dimension */ - uint ny /* number of scalars in y dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny /* number of scalars in y dimension */ ); /* allocate metadata for 3D field f[nz][ny][nx] */ @@ -382,9 +397,9 @@ zfp_field* /* allocated field metadata */ zfp_field_3d( void* pointer, /* pointer to uncompressed scalars (may be NULL) */ zfp_type type, /* scalar type */ - uint nx, /* number of scalars in x dimension */ - uint ny, /* number of scalars in y dimension */ - uint nz /* number of scalars in z dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny, /* number of scalars in y dimension */ + size_t nz /* number of scalars in z dimension */ ); /* allocate metadata for 4D field f[nw][nz][ny][nx] */ @@ -392,10 +407,10 @@ zfp_field* /* allocated field metadata */ zfp_field_4d( void* pointer, /* pointer to uncompressed scalars (may be NULL) */ zfp_type 
type, /* scalar type */ - uint nx, /* number of scalars in x dimension */ - uint ny, /* number of scalars in y dimension */ - uint nz, /* number of scalars in z dimension */ - uint nw /* number of scalars in w dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny, /* number of scalars in y dimension */ + size_t nz, /* number of scalars in z dimension */ + size_t nw /* number of scalars in w dimension */ ); /* deallocate field metadata */ @@ -412,6 +427,12 @@ zfp_field_pointer( const zfp_field* field /* field metadata */ ); +/* pointer to lowest memory address spanned by field */ +void* +zfp_field_begin( + const zfp_field* field /* field metadata */ +); + /* field scalar type */ zfp_type /* scalar type */ zfp_field_type( @@ -424,7 +445,7 @@ zfp_field_precision( const zfp_field* field /* field metadata */ ); -/* field dimensionality (1, 2, or 3) */ +/* field dimensionality (1, 2, 3, or 4) */ uint /* number of dimensions */ zfp_field_dimensionality( const zfp_field* field /* field metadata */ @@ -434,14 +455,32 @@ zfp_field_dimensionality( size_t /* total number of scalars */ zfp_field_size( const zfp_field* field, /* field metadata */ - uint* size /* number of scalars per dimension (may be NULL) */ + size_t* size /* number of scalars per dimension (may be NULL) */ +); + +/* number of bytes spanned by field data including gaps (if any) */ +size_t +zfp_field_size_bytes( + const zfp_field* field /* field metadata */ +); + +/* field size in number of blocks */ +size_t /* total number of blocks */ +zfp_field_blocks( + const zfp_field* field /* field metadata */ ); /* field strides per dimension */ -int /* zero if array is contiguous */ +zfp_bool /* true if array is not contiguous */ zfp_field_stride( const zfp_field* field, /* field metadata */ - int* stride /* stride in scalars per dimension (may be NULL) */ + ptrdiff_t* stride /* stride in scalars per dimension (may be NULL) */ +); + +/* field contiguity test */ +zfp_bool /* true if field layout is contiguous */ +zfp_field_is_contiguous( + const zfp_field* field /* field metadata */ ); /* field scalar type and dimensions */ @@ -470,72 +509,72 @@ zfp_field_set_type( void zfp_field_set_size_1d( zfp_field* field, /* field metadata */ - uint nx /* number of scalars */ + size_t nx /* number of scalars */ ); /* set 2D field size */ void zfp_field_set_size_2d( zfp_field* field, /* field metadata */ - uint nx, /* number of scalars in x dimension */ - uint ny /* number of scalars in y dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny /* number of scalars in y dimension */ ); /* set 3D field size */ void zfp_field_set_size_3d( zfp_field* field, /* field metadata */ - uint nx, /* number of scalars in x dimension */ - uint ny, /* number of scalars in y dimension */ - uint nz /* number of scalars in z dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny, /* number of scalars in y dimension */ + size_t nz /* number of scalars in z dimension */ ); /* set 4D field size */ void zfp_field_set_size_4d( zfp_field* field, /* field metadata */ - uint nx, /* number of scalars in x dimension */ - uint ny, /* number of scalars in y dimension */ - uint nz, /* number of scalars in z dimension */ - uint nw /* number of scalars in w dimension */ + size_t nx, /* number of scalars in x dimension */ + size_t ny, /* number of scalars in y dimension */ + size_t nz, /* number of scalars in z dimension */ + size_t nw /* number of scalars in w dimension */ ); /* set 1D field stride in number of scalars */ 
void zfp_field_set_stride_1d( zfp_field* field, /* field metadata */ - int sx /* stride in number of scalars: &f[1] - &f[0] */ + ptrdiff_t sx /* stride in number of scalars: &f[1] - &f[0] */ ); /* set 2D field strides in number of scalars */ void zfp_field_set_stride_2d( zfp_field* field, /* field metadata */ - int sx, /* stride in x dimension: &f[0][1] - &f[0][0] */ - int sy /* stride in y dimension: &f[1][0] - &f[0][0] */ + ptrdiff_t sx, /* stride in x dimension: &f[0][1] - &f[0][0] */ + ptrdiff_t sy /* stride in y dimension: &f[1][0] - &f[0][0] */ ); /* set 3D field strides in number of scalars */ void zfp_field_set_stride_3d( zfp_field* field, /* field metadata */ - int sx, /* stride in x dimension: &f[0][0][1] - &f[0][0][0] */ - int sy, /* stride in y dimension: &f[0][1][0] - &f[0][0][0] */ - int sz /* stride in z dimension: &f[1][0][0] - &f[0][0][0] */ + ptrdiff_t sx, /* stride in x dimension: &f[0][0][1] - &f[0][0][0] */ + ptrdiff_t sy, /* stride in y dimension: &f[0][1][0] - &f[0][0][0] */ + ptrdiff_t sz /* stride in z dimension: &f[1][0][0] - &f[0][0][0] */ ); /* set 4D field strides in number of scalars */ void zfp_field_set_stride_4d( zfp_field* field, /* field metadata */ - int sx, /* stride in x dimension: &f[0][0][0][1] - &f[0][0][0][0] */ - int sy, /* stride in y dimension: &f[0][0][1][0] - &f[0][0][0][0] */ - int sz, /* stride in z dimension: &f[0][1][0][0] - &f[0][0][0][0] */ - int sw /* stride in w dimension: &f[1][0][0][0] - &f[0][0][0][0] */ + ptrdiff_t sx, /* stride in x dimension: &f[0][0][0][1] - &f[0][0][0][0] */ + ptrdiff_t sy, /* stride in y dimension: &f[0][0][1][0] - &f[0][0][0][0] */ + ptrdiff_t sz, /* stride in z dimension: &f[0][1][0][0] - &f[0][0][0][0] */ + ptrdiff_t sw /* stride in w dimension: &f[1][0][0][0] - &f[0][0][0][0] */ ); /* set field scalar type and dimensions */ -int /* nonzero upon success */ +zfp_bool /* true upon success */ zfp_field_set_metadata( zfp_field* field, /* field metadata */ uint64 meta /* compact 52-bit encoding of metadata */ @@ -600,68 +639,68 @@ needed for the compressed block. 
*/ /* encode 1D contiguous block of 4 values */ -uint zfp_encode_block_int32_1(zfp_stream* stream, const int32* block); -uint zfp_encode_block_int64_1(zfp_stream* stream, const int64* block); -uint zfp_encode_block_float_1(zfp_stream* stream, const float* block); -uint zfp_encode_block_double_1(zfp_stream* stream, const double* block); +size_t zfp_encode_block_int32_1(zfp_stream* stream, const int32* block); +size_t zfp_encode_block_int64_1(zfp_stream* stream, const int64* block); +size_t zfp_encode_block_float_1(zfp_stream* stream, const float* block); +size_t zfp_encode_block_double_1(zfp_stream* stream, const double* block); /* encode 1D complete or partial block from strided array */ -uint zfp_encode_block_strided_int32_1(zfp_stream* stream, const int32* p, int sx); -uint zfp_encode_block_strided_int64_1(zfp_stream* stream, const int64* p, int sx); -uint zfp_encode_block_strided_float_1(zfp_stream* stream, const float* p, int sx); -uint zfp_encode_block_strided_double_1(zfp_stream* stream, const double* p, int sx); -uint zfp_encode_partial_block_strided_int32_1(zfp_stream* stream, const int32* p, uint nx, int sx); -uint zfp_encode_partial_block_strided_int64_1(zfp_stream* stream, const int64* p, uint nx, int sx); -uint zfp_encode_partial_block_strided_float_1(zfp_stream* stream, const float* p, uint nx, int sx); -uint zfp_encode_partial_block_strided_double_1(zfp_stream* stream, const double* p, uint nx, int sx); +size_t zfp_encode_block_strided_int32_1(zfp_stream* stream, const int32* p, ptrdiff_t sx); +size_t zfp_encode_block_strided_int64_1(zfp_stream* stream, const int64* p, ptrdiff_t sx); +size_t zfp_encode_block_strided_float_1(zfp_stream* stream, const float* p, ptrdiff_t sx); +size_t zfp_encode_block_strided_double_1(zfp_stream* stream, const double* p, ptrdiff_t sx); +size_t zfp_encode_partial_block_strided_int32_1(zfp_stream* stream, const int32* p, size_t nx, ptrdiff_t sx); +size_t zfp_encode_partial_block_strided_int64_1(zfp_stream* stream, const int64* p, size_t nx, ptrdiff_t sx); +size_t zfp_encode_partial_block_strided_float_1(zfp_stream* stream, const float* p, size_t nx, ptrdiff_t sx); +size_t zfp_encode_partial_block_strided_double_1(zfp_stream* stream, const double* p, size_t nx, ptrdiff_t sx); /* encode 2D contiguous block of 4x4 values */ -uint zfp_encode_block_int32_2(zfp_stream* stream, const int32* block); -uint zfp_encode_block_int64_2(zfp_stream* stream, const int64* block); -uint zfp_encode_block_float_2(zfp_stream* stream, const float* block); -uint zfp_encode_block_double_2(zfp_stream* stream, const double* block); +size_t zfp_encode_block_int32_2(zfp_stream* stream, const int32* block); +size_t zfp_encode_block_int64_2(zfp_stream* stream, const int64* block); +size_t zfp_encode_block_float_2(zfp_stream* stream, const float* block); +size_t zfp_encode_block_double_2(zfp_stream* stream, const double* block); /* encode 2D complete or partial block from strided array */ -uint zfp_encode_partial_block_strided_int32_2(zfp_stream* stream, const int32* p, uint nx, uint ny, int sx, int sy); -uint zfp_encode_partial_block_strided_int64_2(zfp_stream* stream, const int64* p, uint nx, uint ny, int sx, int sy); -uint zfp_encode_partial_block_strided_float_2(zfp_stream* stream, const float* p, uint nx, uint ny, int sx, int sy); -uint zfp_encode_partial_block_strided_double_2(zfp_stream* stream, const double* p, uint nx, uint ny, int sx, int sy); -uint zfp_encode_block_strided_int32_2(zfp_stream* stream, const int32* p, int sx, int sy); -uint 
zfp_encode_block_strided_int64_2(zfp_stream* stream, const int64* p, int sx, int sy); -uint zfp_encode_block_strided_float_2(zfp_stream* stream, const float* p, int sx, int sy); -uint zfp_encode_block_strided_double_2(zfp_stream* stream, const double* p, int sx, int sy); +size_t zfp_encode_partial_block_strided_int32_2(zfp_stream* stream, const int32* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_partial_block_strided_int64_2(zfp_stream* stream, const int64* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_partial_block_strided_float_2(zfp_stream* stream, const float* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_partial_block_strided_double_2(zfp_stream* stream, const double* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_block_strided_int32_2(zfp_stream* stream, const int32* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_block_strided_int64_2(zfp_stream* stream, const int64* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_block_strided_float_2(zfp_stream* stream, const float* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_encode_block_strided_double_2(zfp_stream* stream, const double* p, ptrdiff_t sx, ptrdiff_t sy); /* encode 3D contiguous block of 4x4x4 values */ -uint zfp_encode_block_int32_3(zfp_stream* stream, const int32* block); -uint zfp_encode_block_int64_3(zfp_stream* stream, const int64* block); -uint zfp_encode_block_float_3(zfp_stream* stream, const float* block); -uint zfp_encode_block_double_3(zfp_stream* stream, const double* block); +size_t zfp_encode_block_int32_3(zfp_stream* stream, const int32* block); +size_t zfp_encode_block_int64_3(zfp_stream* stream, const int64* block); +size_t zfp_encode_block_float_3(zfp_stream* stream, const float* block); +size_t zfp_encode_block_double_3(zfp_stream* stream, const double* block); /* encode 3D complete or partial block from strided array */ -uint zfp_encode_block_strided_int32_3(zfp_stream* stream, const int32* p, int sx, int sy, int sz); -uint zfp_encode_block_strided_int64_3(zfp_stream* stream, const int64* p, int sx, int sy, int sz); -uint zfp_encode_block_strided_float_3(zfp_stream* stream, const float* p, int sx, int sy, int sz); -uint zfp_encode_block_strided_double_3(zfp_stream* stream, const double* p, int sx, int sy, int sz); -uint zfp_encode_partial_block_strided_int32_3(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_encode_partial_block_strided_int64_3(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_encode_partial_block_strided_float_3(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_encode_partial_block_strided_double_3(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +size_t zfp_encode_block_strided_int32_3(zfp_stream* stream, const int32* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_block_strided_int64_3(zfp_stream* stream, const int64* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_block_strided_float_3(zfp_stream* stream, const float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_block_strided_double_3(zfp_stream* stream, const double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_partial_block_strided_int32_3(zfp_stream* stream, const int32* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t 
zfp_encode_partial_block_strided_int64_3(zfp_stream* stream, const int64* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_partial_block_strided_float_3(zfp_stream* stream, const float* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_encode_partial_block_strided_double_3(zfp_stream* stream, const double* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); /* encode 4D contiguous block of 4x4x4x4 values */ -uint zfp_encode_block_int32_4(zfp_stream* stream, const int32* block); -uint zfp_encode_block_int64_4(zfp_stream* stream, const int64* block); -uint zfp_encode_block_float_4(zfp_stream* stream, const float* block); -uint zfp_encode_block_double_4(zfp_stream* stream, const double* block); +size_t zfp_encode_block_int32_4(zfp_stream* stream, const int32* block); +size_t zfp_encode_block_int64_4(zfp_stream* stream, const int64* block); +size_t zfp_encode_block_float_4(zfp_stream* stream, const float* block); +size_t zfp_encode_block_double_4(zfp_stream* stream, const double* block); /* encode 4D complete or partial block from strided array */ -uint zfp_encode_block_strided_int32_4(zfp_stream* stream, const int32* p, int sx, int sy, int sz, int sw); -uint zfp_encode_block_strided_int64_4(zfp_stream* stream, const int64* p, int sx, int sy, int sz, int sw); -uint zfp_encode_block_strided_float_4(zfp_stream* stream, const float* p, int sx, int sy, int sz, int sw); -uint zfp_encode_block_strided_double_4(zfp_stream* stream, const double* p, int sx, int sy, int sz, int sw); -uint zfp_encode_partial_block_strided_int32_4(zfp_stream* stream, const int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_encode_partial_block_strided_int64_4(zfp_stream* stream, const int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_encode_partial_block_strided_float_4(zfp_stream* stream, const float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_encode_partial_block_strided_double_4(zfp_stream* stream, const double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +size_t zfp_encode_block_strided_int32_4(zfp_stream* stream, const int32* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_block_strided_int64_4(zfp_stream* stream, const int64* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_block_strided_float_4(zfp_stream* stream, const float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_block_strided_double_4(zfp_stream* stream, const double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_partial_block_strided_int32_4(zfp_stream* stream, const int32* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_partial_block_strided_int64_4(zfp_stream* stream, const int64* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_partial_block_strided_float_4(zfp_stream* stream, const float* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_encode_partial_block_strided_double_4(zfp_stream* stream, const double* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); /* low-level API: decoder 
-------------------------------------------------- */ @@ -672,68 +711,68 @@ further details. */ /* decode 1D contiguous block of 4 values */ -uint zfp_decode_block_int32_1(zfp_stream* stream, int32* block); -uint zfp_decode_block_int64_1(zfp_stream* stream, int64* block); -uint zfp_decode_block_float_1(zfp_stream* stream, float* block); -uint zfp_decode_block_double_1(zfp_stream* stream, double* block); +size_t zfp_decode_block_int32_1(zfp_stream* stream, int32* block); +size_t zfp_decode_block_int64_1(zfp_stream* stream, int64* block); +size_t zfp_decode_block_float_1(zfp_stream* stream, float* block); +size_t zfp_decode_block_double_1(zfp_stream* stream, double* block); /* decode 1D complete or partial block from strided array */ -uint zfp_decode_block_strided_int32_1(zfp_stream* stream, int32* p, int sx); -uint zfp_decode_block_strided_int64_1(zfp_stream* stream, int64* p, int sx); -uint zfp_decode_block_strided_float_1(zfp_stream* stream, float* p, int sx); -uint zfp_decode_block_strided_double_1(zfp_stream* stream, double* p, int sx); -uint zfp_decode_partial_block_strided_int32_1(zfp_stream* stream, int32* p, uint nx, int sx); -uint zfp_decode_partial_block_strided_int64_1(zfp_stream* stream, int64* p, uint nx, int sx); -uint zfp_decode_partial_block_strided_float_1(zfp_stream* stream, float* p, uint nx, int sx); -uint zfp_decode_partial_block_strided_double_1(zfp_stream* stream, double* p, uint nx, int sx); +size_t zfp_decode_block_strided_int32_1(zfp_stream* stream, int32* p, ptrdiff_t sx); +size_t zfp_decode_block_strided_int64_1(zfp_stream* stream, int64* p, ptrdiff_t sx); +size_t zfp_decode_block_strided_float_1(zfp_stream* stream, float* p, ptrdiff_t sx); +size_t zfp_decode_block_strided_double_1(zfp_stream* stream, double* p, ptrdiff_t sx); +size_t zfp_decode_partial_block_strided_int32_1(zfp_stream* stream, int32* p, size_t nx, ptrdiff_t sx); +size_t zfp_decode_partial_block_strided_int64_1(zfp_stream* stream, int64* p, size_t nx, ptrdiff_t sx); +size_t zfp_decode_partial_block_strided_float_1(zfp_stream* stream, float* p, size_t nx, ptrdiff_t sx); +size_t zfp_decode_partial_block_strided_double_1(zfp_stream* stream, double* p, size_t nx, ptrdiff_t sx); /* decode 2D contiguous block of 4x4 values */ -uint zfp_decode_block_int32_2(zfp_stream* stream, int32* block); -uint zfp_decode_block_int64_2(zfp_stream* stream, int64* block); -uint zfp_decode_block_float_2(zfp_stream* stream, float* block); -uint zfp_decode_block_double_2(zfp_stream* stream, double* block); +size_t zfp_decode_block_int32_2(zfp_stream* stream, int32* block); +size_t zfp_decode_block_int64_2(zfp_stream* stream, int64* block); +size_t zfp_decode_block_float_2(zfp_stream* stream, float* block); +size_t zfp_decode_block_double_2(zfp_stream* stream, double* block); /* decode 2D complete or partial block from strided array */ -uint zfp_decode_block_strided_int32_2(zfp_stream* stream, int32* p, int sx, int sy); -uint zfp_decode_block_strided_int64_2(zfp_stream* stream, int64* p, int sx, int sy); -uint zfp_decode_block_strided_float_2(zfp_stream* stream, float* p, int sx, int sy); -uint zfp_decode_block_strided_double_2(zfp_stream* stream, double* p, int sx, int sy); -uint zfp_decode_partial_block_strided_int32_2(zfp_stream* stream, int32* p, uint nx, uint ny, int sx, int sy); -uint zfp_decode_partial_block_strided_int64_2(zfp_stream* stream, int64* p, uint nx, uint ny, int sx, int sy); -uint zfp_decode_partial_block_strided_float_2(zfp_stream* stream, float* p, uint nx, uint ny, int sx, int sy); -uint 
zfp_decode_partial_block_strided_double_2(zfp_stream* stream, double* p, uint nx, uint ny, int sx, int sy); +size_t zfp_decode_block_strided_int32_2(zfp_stream* stream, int32* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_block_strided_int64_2(zfp_stream* stream, int64* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_block_strided_float_2(zfp_stream* stream, float* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_block_strided_double_2(zfp_stream* stream, double* p, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_partial_block_strided_int32_2(zfp_stream* stream, int32* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_partial_block_strided_int64_2(zfp_stream* stream, int64* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_partial_block_strided_float_2(zfp_stream* stream, float* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); +size_t zfp_decode_partial_block_strided_double_2(zfp_stream* stream, double* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); /* decode 3D contiguous block of 4x4x4 values */ -uint zfp_decode_block_int32_3(zfp_stream* stream, int32* block); -uint zfp_decode_block_int64_3(zfp_stream* stream, int64* block); -uint zfp_decode_block_float_3(zfp_stream* stream, float* block); -uint zfp_decode_block_double_3(zfp_stream* stream, double* block); +size_t zfp_decode_block_int32_3(zfp_stream* stream, int32* block); +size_t zfp_decode_block_int64_3(zfp_stream* stream, int64* block); +size_t zfp_decode_block_float_3(zfp_stream* stream, float* block); +size_t zfp_decode_block_double_3(zfp_stream* stream, double* block); /* decode 3D complete or partial block from strided array */ -uint zfp_decode_block_strided_int32_3(zfp_stream* stream, int32* p, int sx, int sy, int sz); -uint zfp_decode_block_strided_int64_3(zfp_stream* stream, int64* p, int sx, int sy, int sz); -uint zfp_decode_block_strided_float_3(zfp_stream* stream, float* p, int sx, int sy, int sz); -uint zfp_decode_block_strided_double_3(zfp_stream* stream, double* p, int sx, int sy, int sz); -uint zfp_decode_partial_block_strided_int32_3(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_decode_partial_block_strided_int64_3(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_decode_partial_block_strided_float_3(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); -uint zfp_decode_partial_block_strided_double_3(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, int sx, int sy, int sz); +size_t zfp_decode_block_strided_int32_3(zfp_stream* stream, int32* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_block_strided_int64_3(zfp_stream* stream, int64* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_block_strided_float_3(zfp_stream* stream, float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_block_strided_double_3(zfp_stream* stream, double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_partial_block_strided_int32_3(zfp_stream* stream, int32* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_partial_block_strided_int64_3(zfp_stream* stream, int64* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t zfp_decode_partial_block_strided_float_3(zfp_stream* stream, float* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); +size_t 
zfp_decode_partial_block_strided_double_3(zfp_stream* stream, double* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); /* decode 4D contiguous block of 4x4x4x4 values */ -uint zfp_decode_block_int32_4(zfp_stream* stream, int32* block); -uint zfp_decode_block_int64_4(zfp_stream* stream, int64* block); -uint zfp_decode_block_float_4(zfp_stream* stream, float* block); -uint zfp_decode_block_double_4(zfp_stream* stream, double* block); +size_t zfp_decode_block_int32_4(zfp_stream* stream, int32* block); +size_t zfp_decode_block_int64_4(zfp_stream* stream, int64* block); +size_t zfp_decode_block_float_4(zfp_stream* stream, float* block); +size_t zfp_decode_block_double_4(zfp_stream* stream, double* block); /* decode 4D complete or partial block from strided array */ -uint zfp_decode_block_strided_int32_4(zfp_stream* stream, int32* p, int sx, int sy, int sz, int sw); -uint zfp_decode_block_strided_int64_4(zfp_stream* stream, int64* p, int sx, int sy, int sz, int sw); -uint zfp_decode_block_strided_float_4(zfp_stream* stream, float* p, int sx, int sy, int sz, int sw); -uint zfp_decode_block_strided_double_4(zfp_stream* stream, double* p, int sx, int sy, int sz, int sw); -uint zfp_decode_partial_block_strided_int32_4(zfp_stream* stream, int32* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_decode_partial_block_strided_int64_4(zfp_stream* stream, int64* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_decode_partial_block_strided_float_4(zfp_stream* stream, float* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); -uint zfp_decode_partial_block_strided_double_4(zfp_stream* stream, double* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw); +size_t zfp_decode_block_strided_int32_4(zfp_stream* stream, int32* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_block_strided_int64_4(zfp_stream* stream, int64* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_block_strided_float_4(zfp_stream* stream, float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_block_strided_double_4(zfp_stream* stream, double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_partial_block_strided_int32_4(zfp_stream* stream, int32* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_partial_block_strided_int64_4(zfp_stream* stream, int64* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_partial_block_strided_float_4(zfp_stream* stream, float* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); +size_t zfp_decode_partial_block_strided_double_4(zfp_stream* stream, double* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); /* low-level API: utility functions ---------------------------------------- */ diff --git a/include/zfp.hpp b/include/zfp.hpp new file mode 100644 index 00000000..406fec9b --- /dev/null +++ b/include/zfp.hpp @@ -0,0 +1,289 @@ +#ifndef ZFP_HPP +#define ZFP_HPP + +// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC and +// other zfp project contributors. See the top-level LICENSE file for details. 
+// SPDX-License-Identifier: BSD-3-Clause + +#include "zfp.h" + +// templated C++ wrappers around libzfp low-level C functions +namespace zfp { + +// encoder declarations ------------------------------------------------------- + +template +inline size_t +encode_block(zfp_stream* zfp, const Scalar* block); + +template +inline size_t +encode_block_strided(zfp_stream* zfp, const Scalar* p, ptrdiff_t sx); + +template +inline size_t +encode_block_strided(zfp_stream* zfp, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy); + +template +inline size_t +encode_block_strided(zfp_stream* zfp, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); + +template +inline size_t +encode_block_strided(zfp_stream* zfp, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); + +template +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const Scalar* p, size_t nx, ptrdiff_t sx); + +template +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); + +template +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); + +template +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); + +// encoder specializations ---------------------------------------------------- + +template<> +inline size_t +encode_block(zfp_stream* zfp, const float* block) { return zfp_encode_block_float_1(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const float* block) { return zfp_encode_block_float_2(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const float* block) { return zfp_encode_block_float_3(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const float* block) { return zfp_encode_block_float_4(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const double* block) { return zfp_encode_block_double_1(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const double* block) { return zfp_encode_block_double_2(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const double* block) { return zfp_encode_block_double_3(zfp, block); } + +template<> +inline size_t +encode_block(zfp_stream* zfp, const double* block) { return zfp_encode_block_double_4(zfp, block); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const float* p, ptrdiff_t sx) { return zfp_encode_block_strided_float_1(zfp, p, sx); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const float* p, ptrdiff_t sx, ptrdiff_t sy) { return zfp_encode_block_strided_float_2(zfp, p, sx, sy); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_encode_block_strided_float_3(zfp, p, sx, sy, sz); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_encode_block_strided_float_4(zfp, p, sx, sy, sz, sw); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const double* p, ptrdiff_t sx) { return zfp_encode_block_strided_double_1(zfp, p, sx); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const double* p, ptrdiff_t sx, ptrdiff_t sy) { return 
zfp_encode_block_strided_double_2(zfp, p, sx, sy); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_encode_block_strided_double_3(zfp, p, sx, sy, sz); } + +template <> +inline size_t +encode_block_strided(zfp_stream* zfp, const double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_encode_block_strided_double_4(zfp, p, sx, sy, sz, sw); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const float* p, size_t nx, ptrdiff_t sx) +{ return zfp_encode_partial_block_strided_float_1(zfp, p, nx, sx); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const float* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { return zfp_encode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const float* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_encode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const float* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_encode_partial_block_strided_float_4(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const double* p, size_t nx, ptrdiff_t sx) +{ return zfp_encode_partial_block_strided_double_1(zfp, p, nx, sx); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const double* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { return zfp_encode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const double* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_encode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); } + +template <> +inline size_t +encode_partial_block_strided(zfp_stream* zfp, const double* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_encode_partial_block_strided_double_4(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); } + +// decoder declarations ------------------------------------------------------- + +template +inline size_t +decode_block(zfp_stream* zfp, Scalar* block); + +template +inline size_t +decode_block_strided(zfp_stream* zfp, Scalar* p, ptrdiff_t sx); + +template +inline size_t +decode_block_strided(zfp_stream* zfp, Scalar* p, ptrdiff_t sx, ptrdiff_t sy); + +template +inline size_t +decode_block_strided(zfp_stream* zfp, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); + +template +inline size_t +decode_block_strided(zfp_stream* zfp, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw); + +template +inline size_t +decode_partial_block_strided(zfp_stream* zfp, Scalar* p, size_t nx, ptrdiff_t sx); + +template +inline size_t +decode_partial_block_strided(zfp_stream* zfp, Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy); + +template +inline size_t +decode_partial_block_strided(zfp_stream* zfp, Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz); + +template +inline size_t +decode_partial_block_strided(zfp_stream* zfp, Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, 
ptrdiff_t sw); + +// decoder specializations ---------------------------------------------------- + +template<> +inline size_t +decode_block(zfp_stream* zfp, float* block) { return zfp_decode_block_float_1(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, float* block) { return zfp_decode_block_float_2(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, float* block) { return zfp_decode_block_float_3(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, float* block) { return zfp_decode_block_float_4(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, double* block) { return zfp_decode_block_double_1(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, double* block) { return zfp_decode_block_double_2(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, double* block) { return zfp_decode_block_double_3(zfp, block); } + +template<> +inline size_t +decode_block(zfp_stream* zfp, double* block) { return zfp_decode_block_double_4(zfp, block); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, float* p, ptrdiff_t sx) { return zfp_decode_block_strided_float_1(zfp, p, sx); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, float* p, ptrdiff_t sx, ptrdiff_t sy) { return zfp_decode_block_strided_float_2(zfp, p, sx, sy); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_decode_block_strided_float_3(zfp, p, sx, sy, sz); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, float* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_decode_block_strided_float_4(zfp, p, sx, sy, sz, sw); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, double* p, ptrdiff_t sx) { return zfp_decode_block_strided_double_1(zfp, p, sx); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, double* p, ptrdiff_t sx, ptrdiff_t sy) { return zfp_decode_block_strided_double_2(zfp, p, sx, sy); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_decode_block_strided_double_3(zfp, p, sx, sy, sz); } + +template <> +inline size_t +decode_block_strided(zfp_stream* zfp, double* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_decode_block_strided_double_4(zfp, p, sx, sy, sz, sw); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, float* p, size_t nx, ptrdiff_t sx) { return zfp_decode_partial_block_strided_float_1(zfp, p, nx, sx); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, float* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { return zfp_decode_partial_block_strided_float_2(zfp, p, nx, ny, sx, sy); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, float* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_decode_partial_block_strided_float_3(zfp, p, nx, ny, nz, sx, sy, sz); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, float* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_decode_partial_block_strided_float_4(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, double* p, size_t nx, ptrdiff_t sx) { 
return zfp_decode_partial_block_strided_double_1(zfp, p, nx, sx); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, double* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { return zfp_decode_partial_block_strided_double_2(zfp, p, nx, ny, sx, sy); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, double* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { return zfp_decode_partial_block_strided_double_3(zfp, p, nx, ny, nz, sx, sy, sz); } + +template <> +inline size_t +decode_partial_block_strided(zfp_stream* zfp, double* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { return zfp_decode_partial_block_strided_double_4(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); } + +} + +#endif diff --git a/include/zfp/array.h b/include/zfp/array.h new file mode 100644 index 00000000..b503abc4 --- /dev/null +++ b/include/zfp/array.h @@ -0,0 +1,32 @@ +#ifndef CFP_ARRAY_H +#define CFP_ARRAY_H + +#include +#include "zfp/internal/cfp/header.h" +#include "zfp/internal/cfp/array1f.h" +#include "zfp/internal/cfp/array1d.h" +#include "zfp/internal/cfp/array2f.h" +#include "zfp/internal/cfp/array2d.h" +#include "zfp/internal/cfp/array3f.h" +#include "zfp/internal/cfp/array3d.h" +#include "zfp/internal/cfp/array4f.h" +#include "zfp/internal/cfp/array4d.h" + +typedef struct { + cfp_array1f_api array1f; + cfp_array1d_api array1d; + cfp_array2f_api array2f; + cfp_array2d_api array2d; + cfp_array3f_api array3f; + cfp_array3d_api array3d; + cfp_array4f_api array4f; + cfp_array4d_api array4d; +} cfp_api; + +#ifndef CFP_NAMESPACE + #define CFP_NAMESPACE cfp +#endif + +extern_ const cfp_api CFP_NAMESPACE; + +#endif diff --git a/include/zfp/array.hpp b/include/zfp/array.hpp new file mode 100644 index 00000000..07d5b08b --- /dev/null +++ b/include/zfp/array.hpp @@ -0,0 +1,95 @@ +#ifndef ZFP_ARRAY_HPP +#define ZFP_ARRAY_HPP + +#include +#include +#include +#include "zfp.h" +#include "zfp/internal/array/exception.hpp" + +namespace zfp { + +// abstract base class for compressed array of scalars +class array { +public: + #include "zfp/internal/array/header.hpp" + + // factory function (see zfpfactory.h) + static zfp::array* construct(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0); + + // public virtual destructor (can delete array through base class pointer) + virtual ~array() {} + + // underlying scalar type + zfp_type scalar_type() const { return type; } + + // dimensionality + uint dimensionality() const { return dims; } + + // rate in bits per value + virtual double rate() const = 0; + + // compressed data size and buffer + virtual size_t compressed_size() const = 0; + virtual void* compressed_data() const = 0; + +protected: + // default constructor + array() : + type(zfp_type_none), + dims(0), + nx(0), ny(0), nz(0), nw(0) + {} + + // generic array with 'dims' dimensions and scalar type 'type' + explicit array(uint dims, zfp_type type) : + type(type), + dims(dims), + nx(0), ny(0), nz(0), nw(0) + {} + + // constructor from previously-serialized compressed array + explicit array(uint dims, zfp_type type, const zfp::array::header& header) : + type(type), + dims(dims), + nx(header.size_x()), ny(header.size_y()), nz(header.size_z()), nw(header.size_w()) + { + if (header.scalar_type() != type) + throw zfp::exception("zfp array scalar type does not match header"); + if (header.dimensionality() != dims) + throw zfp::exception("zfp array dimensionality 
does not match header"); + } + + // copy constructor--performs a deep copy + array(const array& a) + { + deep_copy(a); + } + + // assignment operator--performs a deep copy + array& operator=(const array& a) + { + deep_copy(a); + return *this; + } + + // perform a deep copy + void deep_copy(const array& a) + { + // copy metadata + type = a.type; + dims = a.dims; + nx = a.nx; + ny = a.ny; + nz = a.nz; + nw = a.nw; + } + + zfp_type type; // scalar type + uint dims; // array dimensionality (1, 2, 3, or 4) + size_t nx, ny, nz, nw; // array dimensions +}; + +} + +#endif diff --git a/include/zfp/array1.hpp b/include/zfp/array1.hpp new file mode 100644 index 00000000..6b89fefa --- /dev/null +++ b/include/zfp/array1.hpp @@ -0,0 +1,265 @@ +#ifndef ZFP_ARRAY1_HPP +#define ZFP_ARRAY1_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache1.hpp" +#include "zfp/internal/array/handle1.hpp" +#include "zfp/internal/array/iterator1.hpp" +#include "zfp/internal/array/pointer1.hpp" +#include "zfp/internal/array/reference1.hpp" +#include "zfp/internal/array/store1.hpp" +#include "zfp/internal/array/view1.hpp" + +namespace zfp { + +// compressed 2D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp1, + class Index = zfp::index::implicit +> +class array1 : public array { +public: + // types utilized by nested classes + typedef array1 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore1 store_type; + typedef zfp::internal::BlockCache1 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim1::const_reference const_reference; + typedef zfp::internal::dim1::const_pointer const_pointer; + typedef zfp::internal::dim1::const_iterator const_iterator; + typedef zfp::internal::dim1::const_view const_view; + typedef zfp::internal::dim1::private_const_view private_const_view; + typedef zfp::internal::dim1::reference reference; + typedef zfp::internal::dim1::pointer pointer; + typedef zfp::internal::dim1::iterator iterator; + typedef zfp::internal::dim1::view view; + typedef zfp::internal::dim1::private_view private_view; + + // default constructor + array1() : + array(1, Codec::type), + cache(store) + {} + + // constructor of nx-element array using rate bits per value, at least + // cache_size bytes of cache, and optionally initialized from flat array p + array1(size_t nx, double rate, const value_type* p = 0, size_t cache_size = 0) : + array(1, Codec::type), + store(nx, zfp_config_rate(rate, true)), + cache(store, cache_size) + { + this->nx = nx; + if (p) + set(p); + } + + // constructor, from previously-serialized compressed array + array1(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0) : + array(1, Codec::type, header), + store(header.size_x(), zfp_config_rate(header.rate(), true)), + cache(store) + { + if (buffer) { + if (buffer_size_bytes && buffer_size_bytes < store.compressed_size()) + throw zfp::exception("buffer size is smaller than required"); + std::memcpy(store.compressed_data(), buffer, store.compressed_size()); + } + } + + // copy constructor--performs a deep copy + array1(const array1& a) : + array(), + cache(store) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template + array1(const View& v) : + array(1, Codec::type), + store(v.size_x(), zfp_config_rate(v.rate(), 
true)), + cache(store) + { + this->nx = v.size_x(); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i()); + } + + // virtual destructor + virtual ~array1() {} + + // assignment operator--performs a deep copy + array1& operator=(const array1& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx; } + + // array dimensions + size_t size_x() const { return nx; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, bool clear = true) + { + cache.clear(); + this->nx = nx; + store.resize(nx, clear); + } + + // rate in bits per value + double rate() const { return store.rate(); } + + // set rate in bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, true); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const ptrdiff_t sx = 1; + size_t block_index = 0; + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p) + { + const size_t bx = store.block_size_x(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + for (size_t i = 0; i < bx; i++, p += 4) + cache.put_block(block_index++, p, sx); + } + else { + // zero-initialize array + const value_type block[4] = {}; + while (block_index < bx) + cache.put_block(block_index++, block, 1); + } + } + + // accessors + const_reference operator()(size_t i) const { return const_reference(const_cast(this), i); } + reference operator()(size_t i) { return reference(this, i); } + + // flat index accessors + const_reference operator[](size_t index) const { return const_reference(const_cast(this), index); } + reference operator[](size_t index) { return reference(this, index); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0); } + const_iterator cend() const { return const_iterator(this, nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, nx); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_reference; + friend class zfp::internal::dim1::const_pointer; + friend class 
zfp::internal::dim1::const_iterator; + friend class zfp::internal::dim1::const_view; + friend class zfp::internal::dim1::private_const_view; + friend class zfp::internal::dim1::reference; + friend class zfp::internal::dim1::pointer; + friend class zfp::internal::dim1::iterator; + friend class zfp::internal::dim1::view; + friend class zfp::internal::dim1::private_view; + + // perform a deep copy + void deep_copy(const array1& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + + // inspector + value_type get(size_t i) const { return cache.get(i); } + + // mutators (called from proxy reference) + void set(size_t i, value_type val) { cache.set(i, val); } + void add(size_t i, value_type val) { cache.ref(i) += val; } + void sub(size_t i, value_type val) { cache.ref(i) -= val; } + void mul(size_t i, value_type val) { cache.ref(i) *= val; } + void div(size_t i, value_type val) { cache.ref(i) /= val; } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef array1 array1f; +typedef array1 array1d; + +} + +#endif diff --git a/include/zfp/array2.hpp b/include/zfp/array2.hpp new file mode 100644 index 00000000..d669f7c0 --- /dev/null +++ b/include/zfp/array2.hpp @@ -0,0 +1,301 @@ +#ifndef ZFP_ARRAY2_HPP +#define ZFP_ARRAY2_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache2.hpp" +#include "zfp/internal/array/handle2.hpp" +#include "zfp/internal/array/iterator2.hpp" +#include "zfp/internal/array/pointer2.hpp" +#include "zfp/internal/array/reference2.hpp" +#include "zfp/internal/array/store2.hpp" +#include "zfp/internal/array/view2.hpp" + +namespace zfp { + +// compressed 2D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp2, + class Index = zfp::index::implicit +> +class array2 : public array { +public: + // types utilized by nested classes + typedef array2 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore2 store_type; + typedef zfp::internal::BlockCache2 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim2::const_reference const_reference; + typedef zfp::internal::dim2::const_pointer const_pointer; + typedef zfp::internal::dim2::const_iterator const_iterator; + typedef zfp::internal::dim2::const_view const_view; + typedef zfp::internal::dim2::private_const_view private_const_view; + typedef zfp::internal::dim2::reference reference; + typedef zfp::internal::dim2::pointer pointer; + typedef zfp::internal::dim2::iterator iterator; + typedef zfp::internal::dim2::view view; + typedef zfp::internal::dim2::flat_view flat_view; + typedef zfp::internal::dim2::nested_view1 nested_view1; + typedef zfp::internal::dim2::nested_view2 nested_view2; + typedef zfp::internal::dim2::nested_view2 nested_view; + typedef zfp::internal::dim2::private_view private_view; + + // default constructor + array2() : + array(2, Codec::type), + cache(store) + {} + + // constructor of nx * ny array using rate bits per value, at least + // cache_size bytes of cache, and optionally initialized from flat array p + array2(size_t nx, size_t ny, double rate, const 
value_type* p = 0, size_t cache_size = 0) : + array(2, Codec::type), + store(nx, ny, zfp_config_rate(rate, true)), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + if (p) + set(p); + } + + // constructor, from previously-serialized compressed array + array2(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0) : + array(2, Codec::type, header), + store(header.size_x(), header.size_y(), zfp_config_rate(header.rate(), true)), + cache(store) + { + if (buffer) { + if (buffer_size_bytes && buffer_size_bytes < store.compressed_size()) + throw zfp::exception("buffer size is smaller than required"); + std::memcpy(store.compressed_data(), buffer, store.compressed_size()); + } + } + + // copy constructor--performs a deep copy + array2(const array2& a) : + array(), + cache(store) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template + array2(const View& v) : + array(2, Codec::type), + store(v.size_x(), v.size_y(), zfp_config_rate(v.rate(), true)), + cache(store) + { + this->nx = v.size_x(); + this->ny = v.size_y(); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i(), it.j()); + } + + // virtual destructor + virtual ~array2() {} + + // assignment operator--performs a deep copy + array2& operator=(const array2& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, bool clear = true) + { + cache.clear(); + this->nx = nx; + this->ny = ny; + store.resize(nx, ny, clear); + } + + // rate in bits per value + double rate() const { return store.rate(); } + + // set rate in bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, true); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + size_t block_index = 0; + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p) + { 
+ const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.put_block(block_index++, p, sx, sy); + } + else { + // zero-initialize array + const value_type block[4 * 4] = {}; + while (block_index < bx * by) + cache.put_block(block_index++, block, 1, 4); + } + } + + // (i, j) accessors + const_reference operator()(size_t i, size_t j) const { return const_reference(const_cast(this), i, j); } + reference operator()(size_t i, size_t j) { return reference(this, i, j); } + + // flat index accessors + const_reference operator[](size_t index) const + { + size_t i, j; + ij(i, j, index); + return const_reference(const_cast(this), i, j); + } + reference operator[](size_t index) + { + size_t i, j; + ij(i, j, index); + return reference(this, i, j); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, 0, 0); } + iterator end() { return iterator(this, 0, ny); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_reference; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::const_iterator; + friend class zfp::internal::dim2::const_view; + friend class zfp::internal::dim2::private_const_view; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + friend class zfp::internal::dim2::iterator; + friend class zfp::internal::dim2::view; + friend class zfp::internal::dim2::flat_view; + friend class zfp::internal::dim2::nested_view1; + friend class zfp::internal::dim2::nested_view2; + friend class zfp::internal::dim2::private_view; + + // perform a deep copy + void deep_copy(const array2& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + + // inspector + value_type get(size_t i, size_t j) const { return cache.get(i, j); } + + // mutators (called from proxy reference) + void set(size_t i, size_t j, value_type val) { cache.set(i, j, val); } + void add(size_t i, size_t j, value_type val) { cache.ref(i, j) += val; } + void sub(size_t i, size_t j, value_type val) { cache.ref(i, j) -= val; } + void mul(size_t i, size_t j, value_type val) { cache.ref(i, j) *= val; } + void div(size_t i, size_t j, value_type val) { cache.ref(i, j) /= val; } + + // convert flat index to (i, j) + void ij(size_t& i, size_t& j, size_t index) const + { + i = index % nx; index /= nx; + j = index; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef array2 array2f; +typedef array2 array2d; + +} + +#endif diff --git a/include/zfp/array3.hpp b/include/zfp/array3.hpp new file mode 100644 index 00000000..7e60fade --- /dev/null +++ b/include/zfp/array3.hpp @@ -0,0 +1,316 @@ +#ifndef ZFP_ARRAY3_HPP +#define 
ZFP_ARRAY3_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache3.hpp" +#include "zfp/internal/array/handle3.hpp" +#include "zfp/internal/array/iterator3.hpp" +#include "zfp/internal/array/pointer3.hpp" +#include "zfp/internal/array/reference3.hpp" +#include "zfp/internal/array/store3.hpp" +#include "zfp/internal/array/view3.hpp" + +namespace zfp { + +// compressed 3D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp3, + class Index = zfp::index::implicit +> +class array3 : public array { +public: + // types utilized by nested classes + typedef array3 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore3 store_type; + typedef zfp::internal::BlockCache3 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim3::const_reference const_reference; + typedef zfp::internal::dim3::const_pointer const_pointer; + typedef zfp::internal::dim3::const_iterator const_iterator; + typedef zfp::internal::dim3::const_view const_view; + typedef zfp::internal::dim3::private_const_view private_const_view; + typedef zfp::internal::dim3::reference reference; + typedef zfp::internal::dim3::pointer pointer; + typedef zfp::internal::dim3::iterator iterator; + typedef zfp::internal::dim3::view view; + typedef zfp::internal::dim3::flat_view flat_view; + typedef zfp::internal::dim3::nested_view1 nested_view1; + typedef zfp::internal::dim3::nested_view2 nested_view2; + typedef zfp::internal::dim3::nested_view2 nested_view3; + typedef zfp::internal::dim3::nested_view3 nested_view; + typedef zfp::internal::dim3::private_view private_view; + + // default constructor + array3() : + array(3, Codec::type), + cache(store) + {} + + // constructor of nx * ny * nz array using rate bits per value, at least + // cache_size bytes of cache, and optionally initialized from flat array p + array3(size_t nx, size_t ny, size_t nz, double rate, const value_type* p = 0, size_t cache_size = 0) : + array(3, Codec::type), + store(nx, ny, nz, zfp_config_rate(rate, true)), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + this->nz = nz; + if (p) + set(p); + } + + // constructor, from previously-serialized compressed array + array3(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0) : + array(3, Codec::type, header), + store(header.size_x(), header.size_y(), header.size_z(), zfp_config_rate(header.rate(), true)), + cache(store) + { + if (buffer) { + if (buffer_size_bytes && buffer_size_bytes < store.compressed_size()) + throw zfp::exception("buffer size is smaller than required"); + std::memcpy(store.compressed_data(), buffer, store.compressed_size()); + } + } + + // copy constructor--performs a deep copy + array3(const array3& a) : + array(), + cache(store) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template + array3(const View& v) : + array(3, Codec::type), + store(v.size_x(), v.size_y(), v.size_z(), zfp_config_rate(v.rate(), true)), + cache(store) + { + this->nx = v.size_x(); + this->ny = v.size_y(); + this->nz = v.size_z(); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i(), it.j(), it.k()); + } + + // virtual destructor + virtual ~array3() {} + + // assignment operator--performs a deep copy + array3& operator=(const 
array3& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny * nz; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, size_t nz, bool clear = true) + { + cache.clear(); + this->nx = nx; + this->ny = ny; + this->nz = nz; + store.resize(nx, ny, nz, clear); + } + + // rate in bits per value + double rate() const { return store.rate(); } + + // set rate in bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, true); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + size_t block_index = 0; + for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy, sz); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p) + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.put_block(block_index++, p, sx, sy, sz); + } + else { + // zero-initialize array + const value_type block[4 * 4 * 4] = {}; + while (block_index < bx * by * bz) + cache.put_block(block_index++, block, 1, 4, 16); + } + } + + // (i, j, k) accessors + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(const_cast(this), i, j, k); } + reference operator()(size_t i, size_t j, size_t k) { return reference(this, i, j, k); } + + // flat index accessors + const_reference operator[](size_t index) const + { + size_t i, j, k; + ijk(i, j, k, index); + return const_reference(const_cast(this), i, j, k); + } + 
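  // Worked example (illustrative only; nx = 10, ny = 10 are hypothetical values):
  // the flat index accepted by operator[] is decoded by ijk() further below,
  // with i varying fastest, so index 425 maps to
  //   (i, j, k) = (425 % 10, (425 / 10) % 10, 425 / 100) = (5, 2, 4).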
reference operator[](size_t index) + { + size_t i, j, k; + ijk(i, j, k, index); + return reference(this, i, j, k); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, 0, nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, 0, 0, 0); } + iterator end() { return iterator(this, 0, 0, nz); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_reference; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + friend class zfp::internal::dim3::const_view; + friend class zfp::internal::dim3::private_const_view; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + friend class zfp::internal::dim3::iterator; + friend class zfp::internal::dim3::view; + friend class zfp::internal::dim3::flat_view; + friend class zfp::internal::dim3::nested_view1; + friend class zfp::internal::dim3::nested_view2; + friend class zfp::internal::dim3::nested_view3; + friend class zfp::internal::dim3::private_view; + + // perform a deep copy + void deep_copy(const array3& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + size_t min_z() const { return 0; } + size_t max_z() const { return nz; } + + // inspector + value_type get(size_t i, size_t j, size_t k) const { return cache.get(i, j, k); } + + // mutators (called from proxy reference) + void set(size_t i, size_t j, size_t k, value_type val) { cache.set(i, j, k, val); } + void add(size_t i, size_t j, size_t k, value_type val) { cache.ref(i, j, k) += val; } + void sub(size_t i, size_t j, size_t k, value_type val) { cache.ref(i, j, k) -= val; } + void mul(size_t i, size_t j, size_t k, value_type val) { cache.ref(i, j, k) *= val; } + void div(size_t i, size_t j, size_t k, value_type val) { cache.ref(i, j, k) /= val; } + + // convert flat index to (i, j, k) + void ijk(size_t& i, size_t& j, size_t& k, size_t index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef array3 array3f; +typedef array3 array3d; + +} + +#endif diff --git a/include/zfp/array4.hpp b/include/zfp/array4.hpp new file mode 100644 index 00000000..19c1d811 --- /dev/null +++ b/include/zfp/array4.hpp @@ -0,0 +1,331 @@ +#ifndef ZFP_ARRAY4_HPP +#define ZFP_ARRAY4_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache4.hpp" +#include "zfp/internal/array/handle4.hpp" +#include "zfp/internal/array/iterator4.hpp" +#include "zfp/internal/array/pointer4.hpp" +#include "zfp/internal/array/reference4.hpp" +#include "zfp/internal/array/store4.hpp" +#include "zfp/internal/array/view4.hpp" + +namespace zfp { + +// compressed 3D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp4, + class Index = zfp::index::implicit +> +class array4 : public array { +public: + // types utilized 
by nested classes + typedef array4 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore4 store_type; + typedef zfp::internal::BlockCache4 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim4::const_reference const_reference; + typedef zfp::internal::dim4::const_pointer const_pointer; + typedef zfp::internal::dim4::const_iterator const_iterator; + typedef zfp::internal::dim4::const_view const_view; + typedef zfp::internal::dim4::private_const_view private_const_view; + typedef zfp::internal::dim4::reference reference; + typedef zfp::internal::dim4::pointer pointer; + typedef zfp::internal::dim4::iterator iterator; + typedef zfp::internal::dim4::view view; + typedef zfp::internal::dim4::flat_view flat_view; + typedef zfp::internal::dim4::nested_view1 nested_view1; + typedef zfp::internal::dim4::nested_view2 nested_view2; + typedef zfp::internal::dim4::nested_view3 nested_view3; + typedef zfp::internal::dim4::nested_view4 nested_view4; + typedef zfp::internal::dim4::nested_view4 nested_view; + typedef zfp::internal::dim4::private_view private_view; + + // default constructor + array4() : + array(4, Codec::type), + cache(store) + {} + + // constructor of nx * ny * nz * nw array using rate bits per value, at least + // cache_size bytes of cache, and optionally initialized from flat array p + array4(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const value_type* p = 0, size_t cache_size = 0) : + array(4, Codec::type), + store(nx, ny, nz, nw, zfp_config_rate(rate, true)), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + this->nz = nz; + this->nw = nw; + if (p) + set(p); + } + + // constructor, from previously-serialized compressed array + array4(const zfp::array::header& header, const void* buffer = 0, size_t buffer_size_bytes = 0) : + array(4, Codec::type, header), + store(header.size_x(), header.size_y(), header.size_z(), header.size_w(), zfp_config_rate(header.rate(), true)), + cache(store) + { + if (buffer) { + if (buffer_size_bytes && buffer_size_bytes < store.compressed_size()) + throw zfp::exception("buffer size is smaller than required"); + std::memcpy(store.compressed_data(), buffer, store.compressed_size()); + } + } + + // copy constructor--performs a deep copy + array4(const array4& a) : + array(), + cache(store) + { + deep_copy(a); + } + + // construction from view--perform deep copy of (sub)array + template + array4(const View& v) : + array(4, Codec::type), + store(v.size_x(), v.size_y(), v.size_z(), v.size_w(), zfp_config_rate(v.rate(), true)), + cache(store) + { + this->nx = v.size_x(); + this->ny = v.size_y(); + this->nz = v.size_z(); + this->nw = v.size_w(); + // initialize array in its preferred order + for (iterator it = begin(); it != end(); ++it) + *it = v(it.i(), it.j(), it.k(), it.l()); + } + + // virtual destructor + virtual ~array4() {} + + // assignment operator--performs a deep copy + array4& operator=(const array4& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny * nz * nw; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, size_t nz, size_t nw, bool clear = true) + { + cache.clear(); + 
this->nx = nx; + this->ny = ny; + this->nz = nz; + this->nw = nw; + store.resize(nx, ny, nz, nw, clear); + } + + // rate in bits per value + double rate() const { return store.rate(); } + + // set rate in bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, true); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const size_t bw = store.block_size_w(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + const ptrdiff_t sw = static_cast(nx * ny * nz); + size_t block_index = 0; + for (size_t l = 0; l < bw; l++, p += 4 * sz * ptrdiff_t(nz - bz)) + for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy, sz, sw); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p) + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const size_t bw = store.block_size_w(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + const ptrdiff_t sw = static_cast(nx * ny * nz); + for (size_t l = 0; l < bw; l++, p += 4 * sz * ptrdiff_t(nz - bz)) + for (size_t k = 0; k < bz; k++, p += 4 * sy * ptrdiff_t(ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * ptrdiff_t(nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.put_block(block_index++, p, sx, sy, sz, sw); + } + else { + // zero-initialize array + const value_type block[4 * 4 * 4 * 4] = {}; + while (block_index < bx * by * bz * bw) + cache.put_block(block_index++, block, 1, 4, 16, 64); + } + } + + // (i, j, k) accessors + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(const_cast(this), i, j, k, l); } + reference operator()(size_t i, size_t j, size_t k, size_t l) { return reference(this, i, j, k, l); } + + // flat index accessors + const_reference operator[](size_t index) const + { + size_t i, j, k, l; + ijkl(i, j, k, l, index); + return const_reference(const_cast(this), i, j, k, l); + } + reference operator[](size_t index) + { + size_t i, j, k, 
l; + ijkl(i, j, k, l, index); + return reference(this, i, j, k, l); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, 0, 0, nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, 0, 0, 0, 0); } + iterator end() { return iterator(this, 0, 0, 0, nw); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_reference; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + friend class zfp::internal::dim4::const_view; + friend class zfp::internal::dim4::private_const_view; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + friend class zfp::internal::dim4::iterator; + friend class zfp::internal::dim4::view; + friend class zfp::internal::dim4::flat_view; + friend class zfp::internal::dim4::nested_view1; + friend class zfp::internal::dim4::nested_view2; + friend class zfp::internal::dim4::nested_view3; + friend class zfp::internal::dim4::nested_view4; + friend class zfp::internal::dim4::private_view; + + // perform a deep copy + void deep_copy(const array4& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + size_t min_z() const { return 0; } + size_t max_z() const { return nz; } + size_t min_w() const { return 0; } + size_t max_w() const { return nw; } + + // inspector + value_type get(size_t i, size_t j, size_t k, size_t l) const { return cache.get(i, j, k, l); } + + // mutators (called from proxy reference) + void set(size_t i, size_t j, size_t k, size_t l, value_type val) { cache.set(i, j, k, l, val); } + void add(size_t i, size_t j, size_t k, size_t l, value_type val) { cache.ref(i, j, k, l) += val; } + void sub(size_t i, size_t j, size_t k, size_t l, value_type val) { cache.ref(i, j, k, l) -= val; } + void mul(size_t i, size_t j, size_t k, size_t l, value_type val) { cache.ref(i, j, k, l) *= val; } + void div(size_t i, size_t j, size_t k, size_t l, value_type val) { cache.ref(i, j, k, l) /= val; } + + // convert flat index to (i, j, k) + void ijkl(size_t& i, size_t& j, size_t& k, size_t& l, size_t index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index % nz; index /= nz; + l = index; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef array4 array4f; +typedef array4 array4d; + +} + +#endif diff --git a/include/bitstream.h b/include/zfp/bitstream.h similarity index 64% rename from include/bitstream.h rename to include/zfp/bitstream.h index ad5475fe..85922aad 100644 --- a/include/bitstream.h +++ b/include/zfp/bitstream.h @@ -2,12 +2,21 @@ #define ZFP_BITSTREAM_H #include -#include "zfp/types.h" -#include "zfp/system.h" +#include "zfp/internal/zfp/types.h" +#include "zfp/internal/zfp/system.h" /* forward declaration of opaque type */ typedef struct bitstream bitstream; +/* bit offset into stream where bits are read/written */ +typedef uint64 bitstream_offset; + +/* type for counting number of bits in a stream */ +typedef 
bitstream_offset bitstream_size; + +/* type for counting a small number of bits in a stream */ +typedef size_t bitstream_count; + extern_ const size_t stream_word_bits; /* bit stream granularity */ #ifndef inline_ @@ -24,6 +33,9 @@ void stream_close(bitstream* stream); /* make a copy of bit stream to shared memory buffer */ bitstream* stream_clone(const bitstream* stream); +/* word size in bits (equal to stream_word_bits) */ +bitstream_count stream_alignment(); + /* pointer to beginning of stream */ void* stream_data(const bitstream* stream); @@ -46,40 +58,40 @@ uint stream_read_bit(bitstream* stream); uint stream_write_bit(bitstream* stream, uint bit); /* read 0 <= n <= 64 bits */ -uint64 stream_read_bits(bitstream* stream, uint n); +uint64 stream_read_bits(bitstream* stream, bitstream_count n); /* write 0 <= n <= 64 low bits of value and return remaining bits */ -uint64 stream_write_bits(bitstream* stream, uint64 value, uint n); +uint64 stream_write_bits(bitstream* stream, uint64 value, bitstream_count n); /* return bit offset to next bit to be read */ -size_t stream_rtell(const bitstream* stream); +bitstream_offset stream_rtell(const bitstream* stream); /* return bit offset to next bit to be written */ -size_t stream_wtell(const bitstream* stream); +bitstream_offset stream_wtell(const bitstream* stream); /* rewind stream to beginning */ void stream_rewind(bitstream* stream); /* position stream for reading at given bit offset */ -void stream_rseek(bitstream* stream, size_t offset); +void stream_rseek(bitstream* stream, bitstream_offset offset); /* position stream for writing at given bit offset */ -void stream_wseek(bitstream* stream, size_t offset); +void stream_wseek(bitstream* stream, bitstream_offset offset); /* skip over the next n bits */ -void stream_skip(bitstream* stream, uint n); +void stream_skip(bitstream* stream, bitstream_size n); /* append n zero-bits to stream */ -void stream_pad(bitstream* stream, uint n); +void stream_pad(bitstream* stream, bitstream_size n); /* align stream on next word boundary */ -size_t stream_align(bitstream* stream); +bitstream_count stream_align(bitstream* stream); /* flush out any remaining buffered bits */ -size_t stream_flush(bitstream* stream); +bitstream_count stream_flush(bitstream* stream); /* copy n bits from one bit stream to another */ -void stream_copy(bitstream* dst, bitstream* src, size_t n); +void stream_copy(bitstream* dst, bitstream* src, bitstream_size n); #ifdef BIT_STREAM_STRIDED /* set block size in number of words and spacing in number of blocks */ diff --git a/src/inline/bitstream.c b/include/zfp/bitstream.inl similarity index 73% rename from src/inline/bitstream.c rename to include/zfp/bitstream.inl index aa58b73f..987f5a42 100644 --- a/src/inline/bitstream.c +++ b/include/zfp/bitstream.inl @@ -22,35 +22,36 @@ The following assumptions and restrictions apply: stream for writing. 
In read mode, the following functions may be called: size_t stream_size(stream); - size_t stream_rtell(stream); + bitstream_offset stream_rtell(stream); void stream_rewind(stream); void stream_rseek(stream, offset); - void stream_skip(stream, uint n); - size_t stream_align(stream); + void stream_skip(stream, n); + bitstream_count stream_align(stream); uint stream_read_bit(stream); uint64 stream_read_bits(stream, n); Each of the above read calls has a corresponding write call: size_t stream_size(stream); - size_t stream_wtell(stream); + bitstream_offset stream_wtell(stream); void stream_rewind(stream); void stream_wseek(stream, offset); void stream_pad(stream, n); - size_t stream_flush(stream); + bitstream_count stream_flush(stream); uint stream_write_bit(stream, bit); uint64 stream_write_bits(stream, value, n); 3. The stream buffer is an unsigned integer of a user-specified type given by the BIT_STREAM_WORD_TYPE macro. Bits are read and written in units of this integer word type. Supported types are 8, 16, 32, or 64 bits wide. - The bit width of the buffer is denoted by 'wsize' and can be accessed via - the global constant stream_word_bits. A small wsize allows for fine - granularity reads and writes, and may be preferable when working with many - small blocks of data that require non-sequential access. The default - maximum size of 64 bits ensures maximum speed. Note that even when - wsize < 64, it is still possible to read and write up to 64 bits at a time - using stream_read_bits() and stream_write_bits(). + The bit width of the buffer is denoted by 'wsize' and can be accessed + either via the global constant stream_word_bits or stream_alignment(). + A small wsize allows for fine granularity reads and writes, and may be + preferable when working with many small blocks of data that require + non-sequential access. The default maximum size of 64 bits ensures maximum + speed. Note that even when wsize < 64, it is still possible to read and + write up to 64 bits at a time using stream_read_bits() and + stream_write_bits(). 4. If BIT_STREAM_STRIDED is defined, words read from or written to the stream may be accessed noncontiguously by setting a power-of-two block size (which @@ -58,7 +59,7 @@ The following assumptions and restrictions apply: word pointer is always incremented by one word each time a word is accessed. Once advanced past a block boundary, the word pointer is also advanced by the stride to the next block. This feature may be used to store blocks of - data interleaved, e.g. for progressive coding or for noncontiguous parallel + data interleaved, e.g., for progressive coding or for noncontiguous parallel access to the bit stream Note that the block size is measured in words, while the stride is measured in multiples of the block size. Strided access can have a significant performance penalty. @@ -71,7 +72,7 @@ The following assumptions and restrictions apply: is essentially equivalent to (but faster than) for (i = 0; i < n; i++, value >>= 1) - stream_write_bit(value & 1); + stream_write_bit(stream, value & 1); when 0 <= n <= 64. The same holds for read calls, and thus @@ -80,11 +81,15 @@ The following assumptions and restrictions apply: is essentially equivalent to for (i = 0, value = 0; i < n; i++) - value += (uint64)stream_read_bit() << i; + value += (uint64)stream_read_bit(stream) << i; Note that it is possible to write fewer bits than the argument 'value' holds (possibly even no bits), in which case any unwritten bits are - returned. 
+ shifted right to the least significant position and returned. That is, + value = stream_write_bits(stream, value, n); is equivalent to + + for (i = 0; i < n; i++) + value = stream_write_bits(stream, value, 1); 6. Although the stream_wseek(stream, offset) call allows positioning the stream for writing at any bit offset without any data loss (i.e. all @@ -107,41 +112,43 @@ The following assumptions and restrictions apply: #define inline_ #endif +#include "zfp/bitstream.h" + /* satisfy compiler when args unused */ #define unused_(x) ((void)(x)) /* bit stream word/buffer type; granularity of stream I/O operations */ #ifdef BIT_STREAM_WORD_TYPE /* may be 8-, 16-, 32-, or 64-bit unsigned integer type */ - typedef BIT_STREAM_WORD_TYPE word; + typedef BIT_STREAM_WORD_TYPE bitstream_word; #else /* use maximum word size by default for highest speed */ - typedef uint64 word; + typedef uint64 bitstream_word; #endif /* number of bits in a buffered word */ -#define wsize ((uint)(CHAR_BIT * sizeof(word))) +#define wsize ((bitstream_count)(sizeof(bitstream_word) * CHAR_BIT)) /* bit stream structure (opaque to caller) */ struct bitstream { - uint bits; /* number of buffered bits (0 <= bits < wsize) */ - word buffer; /* buffer for incoming/outgoing bits (buffer < 2^bits) */ - word* ptr; /* pointer to next word to be read/written */ - word* begin; /* beginning of stream */ - word* end; /* end of stream (currently unused) */ + bitstream_count bits; /* number of buffered bits (0 <= bits < wsize) */ + bitstream_word buffer; /* incoming/outgoing bits (buffer < 2^bits) */ + bitstream_word* ptr; /* pointer to next word to be read/written */ + bitstream_word* begin; /* beginning of stream */ + bitstream_word* end; /* end of stream (not enforced) */ #ifdef BIT_STREAM_STRIDED - size_t mask; /* one less the block size in number of words */ - ptrdiff_t delta; /* number of words between consecutive blocks */ + size_t mask; /* one less the block size in number of words */ + ptrdiff_t delta; /* number of words between consecutive blocks */ #endif }; /* private functions ------------------------------------------------------- */ /* read a single word from memory */ -static word +static bitstream_word stream_read_word(bitstream* s) { - word w = *s->ptr++; + bitstream_word w = *s->ptr++; #ifdef BIT_STREAM_STRIDED if (!((s->ptr - s->begin) & s->mask)) s->ptr += s->delta; @@ -151,7 +158,7 @@ stream_read_word(bitstream* s) /* write a single word to memory */ static void -stream_write_word(bitstream* s, word value) +stream_write_word(bitstream* s, bitstream_word value) { *s->ptr++ = value; #ifdef BIT_STREAM_STRIDED @@ -162,6 +169,13 @@ stream_write_word(bitstream* s, word value) /* public functions -------------------------------------------------------- */ +/* word size in bits (equals bitstream_word_bits) */ +inline_ bitstream_count +stream_alignment() +{ + return wsize; +} + /* pointer to beginning of stream */ inline_ void* stream_data(const bitstream* s) @@ -173,14 +187,14 @@ stream_data(const bitstream* s) inline_ size_t stream_size(const bitstream* s) { - return sizeof(word) * (s->ptr - s->begin); + return (size_t)(s->ptr - s->begin) * sizeof(bitstream_word); } /* byte capacity of stream */ inline_ size_t stream_capacity(const bitstream* s) { - return sizeof(word) * (s->end - s->begin); + return (size_t)(s->end - s->begin) * sizeof(bitstream_word); } /* number of words per block */ @@ -226,7 +240,7 @@ stream_read_bit(bitstream* s) inline_ uint stream_write_bit(bitstream* s, uint bit) { - s->buffer += (word)bit << 
s->bits; + s->buffer += (bitstream_word)bit << s->bits; if (++s->bits == wsize) { stream_write_word(s, s->buffer); s->buffer = 0; @@ -237,7 +251,7 @@ stream_write_bit(bitstream* s, uint bit) /* read 0 <= n <= 64 bits */ inline_ uint64 -stream_read_bits(bitstream* s, uint n) +stream_read_bits(bitstream* s, bitstream_count n) { uint64 value = s->buffer; if (s->bits < n) { @@ -272,10 +286,10 @@ stream_read_bits(bitstream* s, uint n) /* write 0 <= n <= 64 low bits of value and return remaining bits */ inline_ uint64 -stream_write_bits(bitstream* s, uint64 value, uint n) +stream_write_bits(bitstream* s, uint64 value, bitstream_count n) { /* append bit string to buffer */ - s->buffer += (word)(value << s->bits); + s->buffer += (bitstream_word)(value << s->bits); s->bits += n; /* is buffer full? */ if (s->bits >= wsize) { @@ -289,27 +303,27 @@ stream_write_bits(bitstream* s, uint64 value, uint n) /* assert: 0 <= s->bits <= n */ stream_write_word(s, s->buffer); /* assert: 0 <= n - s->bits < 64 */ - s->buffer = (word)(value >> (n - s->bits)); + s->buffer = (bitstream_word)(value >> (n - s->bits)); } while (sizeof(s->buffer) < sizeof(value) && s->bits >= wsize); } /* assert: 0 <= s->bits < wsize */ - s->buffer &= ((word)1 << s->bits) - 1; + s->buffer &= ((bitstream_word)1 << s->bits) - 1; /* assert: 0 <= n < 64 */ return value >> n; } /* return bit offset to next bit to be read */ -inline_ size_t +inline_ bitstream_offset stream_rtell(const bitstream* s) { - return wsize * (s->ptr - s->begin) - s->bits; + return (bitstream_offset)(s->ptr - s->begin) * wsize - s->bits; } /* return bit offset to next bit to be written */ -inline_ size_t +inline_ bitstream_offset stream_wtell(const bitstream* s) { - return wsize * (s->ptr - s->begin) + s->bits; + return (bitstream_offset)(s->ptr - s->begin) * wsize + s->bits; } /* position stream for reading or writing at beginning */ @@ -323,10 +337,10 @@ stream_rewind(bitstream* s) /* position stream for reading at given bit offset */ inline_ void -stream_rseek(bitstream* s, size_t offset) +stream_rseek(bitstream* s, bitstream_offset offset) { - uint n = offset % wsize; - s->ptr = s->begin + offset / wsize; + bitstream_count n = (bitstream_count)(offset % wsize); + s->ptr = s->begin + (size_t)(offset / wsize); if (n) { s->buffer = stream_read_word(s) >> n; s->bits = wsize - n; @@ -339,13 +353,13 @@ stream_rseek(bitstream* s, size_t offset) /* position stream for writing at given bit offset */ inline_ void -stream_wseek(bitstream* s, size_t offset) +stream_wseek(bitstream* s, bitstream_offset offset) { - uint n = offset % wsize; - s->ptr = s->begin + offset / wsize; + bitstream_count n = (bitstream_count)(offset % wsize); + s->ptr = s->begin + (size_t)(offset / wsize); if (n) { - word buffer = *s->ptr; - buffer &= ((word)1 << n) - 1; + bitstream_word buffer = *s->ptr; + buffer &= ((bitstream_word)1 << n) - 1; s->buffer = buffer; s->bits = n; } @@ -357,36 +371,38 @@ stream_wseek(bitstream* s, size_t offset) /* skip over the next n bits (n >= 0) */ inline_ void -stream_skip(bitstream* s, uint n) +stream_skip(bitstream* s, bitstream_size n) { stream_rseek(s, stream_rtell(s) + n); } /* append n zero-bits to stream (n >= 0) */ inline_ void -stream_pad(bitstream* s, uint n) +stream_pad(bitstream* s, bitstream_size n) { - for (s->bits += n; s->bits >= wsize; s->bits -= wsize) { + bitstream_offset bits = s->bits; + for (bits += n; bits >= wsize; bits -= wsize) { stream_write_word(s, s->buffer); s->buffer = 0; } + s->bits = (bitstream_count)bits; } /* align stream on next word 
boundary */ -inline_ size_t +inline_ bitstream_count stream_align(bitstream* s) { - uint bits = s->bits; + bitstream_count bits = s->bits; if (bits) stream_skip(s, bits); return bits; } /* write any remaining buffered bits and align stream on next word boundary */ -inline_ size_t +inline_ bitstream_count stream_flush(bitstream* s) { - uint bits = (wsize - s->bits) % wsize; + bitstream_count bits = (wsize - s->bits) % wsize; if (bits) stream_pad(s, bits); return bits; @@ -394,16 +410,16 @@ stream_flush(bitstream* s) /* copy n bits from one bit stream to another */ inline_ void -stream_copy(bitstream* dst, bitstream* src, size_t n) +stream_copy(bitstream* dst, bitstream* src, bitstream_size n) { while (n > wsize) { - word w = (word)stream_read_bits(src, wsize); + bitstream_word w = (bitstream_word)stream_read_bits(src, wsize); stream_write_bits(dst, w, wsize); n -= wsize; } if (n) { - word w = (word)stream_read_bits(src, (uint)n); - stream_write_bits(dst, w, (uint)n); + bitstream_word w = (bitstream_word)stream_read_bits(src, (bitstream_count)n); + stream_write_bits(dst, w, (bitstream_count)n); } } @@ -427,8 +443,8 @@ stream_open(void* buffer, size_t bytes) { bitstream* s = (bitstream*)malloc(sizeof(bitstream)); if (s) { - s->begin = (word*)buffer; - s->end = s->begin + bytes / sizeof(word); + s->begin = (bitstream_word*)buffer; + s->end = s->begin + bytes / sizeof(bitstream_word); #ifdef BIT_STREAM_STRIDED stream_set_stride(s, 0, 0); #endif diff --git a/include/zfp/codec/gencodec.hpp b/include/zfp/codec/gencodec.hpp new file mode 100644 index 00000000..b0eb3230 --- /dev/null +++ b/include/zfp/codec/gencodec.hpp @@ -0,0 +1,421 @@ +#ifndef ZFP_GENERIC_CODEC_HPP +#define ZFP_GENERIC_CODEC_HPP + +// This CODEC allows interfacing with the zfp::array classes via a user-facing +// scalar type, ExternalType (e.g., double), while storing data in memory using +// a possibly less precise scalar type, InternalType (e.g., float). Using +// zfp's caching mechanism, blocks of data may reside for some time in cache +// as ExternalType. This potentially allows a sequence of more precise +// operations to be performed on the data before it is down-converted to +// InternalType and stored to memory. When ExternalType = InternalType, this +// CODEC allows defining arrays that support the full zfp array API but use +// uncompressed storage. To use this CODEC, pass it as the Codec template +// parameter to a zfp::array class of matching dimensionality. 
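// Illustrative usage sketch (assumes the zfp::array2 class template and the
// generic2 codec declared in this patch; for this codec the rate passed to the
// array constructor must equal the bit width of InternalType, here 32 bits for
// float, and the dimensions below are examples only):
//
//   #include "zfp/array2.hpp"
//   #include "zfp/codec/gencodec.hpp"
//
//   // 2D array exposing double through the array API while storing values
//   // uncompressed as float, selected by passing generic2 as the Codec parameter
//   zfp::array2<double, zfp::codec::generic2<double, float> > a(100, 100, 32.0);
//   a(0, 0) = 3.14159; // held as double while cached, stored to memory as float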
+ +#include +#include +#include +#include "zfp.h" +#include "zfp/internal/array/memory.hpp" +#include "zfp/internal/array/traits.hpp" + +namespace zfp { +namespace codec { + +// abstract base class for storing 1D-4D uncompressed blocks of scalars +template < + uint dims, // data dimensionality (1-4) + typename ExternalType, // scalar type exposed through array API + typename InternalType = ExternalType // scalar type used for storage +> +class generic_base { +protected: + // default constructor + generic_base() : + bytes(0), + buffer(0) + {} + +public: + // conservative buffer size for current codec settings + size_t buffer_size(const zfp_field* field) const + { + return zfp_field_blocks(field) * block_size * sizeof(InternalType); + } + + // open + void open(void* data, size_t size) + { + bytes = size; + buffer = static_cast(data); + } + + // close bit stream + void close() + { + bytes = 0; + buffer = 0; + } + + // pointer to beginning of bit stream + void* data() const { return static_cast(buffer); } + + // compression mode + zfp_mode mode() const { return zfp_mode_fixed_rate; } + + // rate in compressed bits/value (equals precision) + double rate() const { return static_cast(precision()); } + + // precision in uncompressed bits/value + uint precision() const { return internal_size_bits; } + + // accuracy as absolute error tolerance (unsupported) + double accuracy() const { return -1; } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const + { + if (minbits) + *minbits = block_size_bits; + if (maxbits) + *maxbits = block_size_bits; + if (maxprec) + *maxprec = precision(); + if (minexp) + *minexp = ZFP_MIN_EXP; + } + + // enable reversible (lossless) mode + void set_reversible() + { + throw zfp::exception("zfp generic codec does not support reversible mode"); + } + + // set rate in compressed bits/value (equals precision) + double set_rate(double rate, bool) + { + return static_cast(set_precision(static_cast(rate))); + } + + // set precision in uncompressed bits/value (must equal InternalType width) + uint set_precision(uint precision) + { + if (precision != internal_size_bits) + throw zfp::exception("zfp generic codec precision mismatch"); + return precision; + } + + // set accuracy as absolute error tolerance + double set_accuracy(double) + { + throw zfp::exception("zfp generic codec does not support fixed-accuracy mode"); + return -1; + } + + // set expert mode parameters + bool set_params(uint, uint, uint, int) + { + throw zfp::exception("zfp generic codec does not support expert mode"); + return false; + } + + // set thread safety mode (not required by this codec) + void set_thread_safety(bool) {} + + // byte size of codec data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // unit of allocated data in bytes + static size_t alignment() { return sizeof(InternalType); } + + static const zfp_type type = zfp::internal::trait::type; // scalar type + + // zfp::codec::generic_base::header class for array (de)serialization + #include "zfp/internal/codec/genheader.hpp" + +protected: + // pointer to beginning of block + InternalType* begin(bitstream_offset offset) const + { + if (offset % internal_size_bits) + throw zfp::exception("zfp generic codec bit offset alignment error"); + return buffer + offset / internal_size_bits; + } + + // store full contiguous block to memory + 
size_t encode_block(bitstream_offset offset, const ExternalType* block) const + { + InternalType* ptr = begin(offset); + for (size_t n = block_size; n--;) + *ptr++ = static_cast(*block++); + return block_size_bits; + } + + // load full contiguous block from memory + size_t decode_block(bitstream_offset offset, ExternalType* block) const + { + const InternalType* ptr = begin(offset); + for (size_t n = block_size; n--;) + *block++ = static_cast(*ptr++); + return block_size_bits; + } + + // constants associated with template arguments + static const size_t internal_size_bits = sizeof(InternalType) * CHAR_BIT; + static const size_t block_size = 1u << (2 * dims); + static const size_t block_size_bits = block_size * internal_size_bits; + + size_t bytes; // number of bytes of storage + InternalType* buffer; // pointer to storage managed by block store +}; + +// 1D codec +template +class generic1 : public generic_base<1, ExternalType, InternalType> { +public: + // encode contiguous 1D block + size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1) + : encode_block(offset, block); + } + + // decode contiguous 1D block + size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const + { + return shape ? decode_block_strided(offset, shape, block, 1) + : decode_block(offset, block); + } + + // encode 1D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx) const + { + InternalType* q = begin(offset); + size_t nx = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + } + for (size_t x = 0; x < nx; x++, p += sx, q++) + *q = static_cast(*p); + return block_size_bits; + } + + // decode 1D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx) const + { + const InternalType* q = begin(offset); + size_t nx = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + } + for (size_t x = 0; x < nx; x++, p += sx, q++) + *p = static_cast(*q); + return block_size_bits; + } + +protected: + using generic_base<1, ExternalType, InternalType>::begin; + using generic_base<1, ExternalType, InternalType>::encode_block; + using generic_base<1, ExternalType, InternalType>::decode_block; + using generic_base<1, ExternalType, InternalType>::block_size_bits; +}; + +// 2D codec +template +class generic2 : public generic_base<2, ExternalType, InternalType> { +public: + // encode contiguous 2D block + size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1, 4) + : encode_block(offset, block); + } + + // decode contiguous 2D block + size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const + { + return shape ? 
decode_block_strided(offset, shape, block, 1, 4) + : decode_block(offset, block); + } + + // encode 2D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy) const + { + InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + } + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *q = static_cast(*p); + return block_size_bits; + } + + // decode 2D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy) const + { + const InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + } + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *p = static_cast(*q); + return block_size_bits; + } + +protected: + using generic_base<2, ExternalType, InternalType>::begin; + using generic_base<2, ExternalType, InternalType>::encode_block; + using generic_base<2, ExternalType, InternalType>::decode_block; + using generic_base<2, ExternalType, InternalType>::block_size_bits; +}; + +// 3D codec +template +class generic3 : public generic_base<3, ExternalType, InternalType> { +public: + // encode contiguous 3D block + size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1, 4, 16) + : encode_block(offset, block); + } + + // decode contiguous 3D block + size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const + { + return shape ? 
decode_block_strided(offset, shape, block, 1, 4, 16) + : decode_block(offset, block); + } + + // encode 3D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + size_t nz = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + nz -= shape & 3u; shape >>= 2; + } + for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *q = static_cast(*p); + return block_size_bits; + } + + // decode 3D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + const InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + size_t nz = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + nz -= shape & 3u; shape >>= 2; + } + for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *p = static_cast(*q); + return block_size_bits; + } + +protected: + using generic_base<3, ExternalType, InternalType>::begin; + using generic_base<3, ExternalType, InternalType>::encode_block; + using generic_base<3, ExternalType, InternalType>::decode_block; + using generic_base<3, ExternalType, InternalType>::block_size_bits; +}; + +// 4D codec +template +class generic4 : public generic_base<4, ExternalType, InternalType> { +public: + // encode contiguous 4D block + size_t encode_block(bitstream_offset offset, uint shape, const ExternalType* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1, 4, 16, 64) + : encode_block(offset, block); + } + + // decode contiguous 4D block + size_t decode_block(bitstream_offset offset, uint shape, ExternalType* block) const + { + return shape ? 
decode_block_strided(offset, shape, block, 1, 4, 16, 64) + : decode_block(offset, block); + } + + // encode 4D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + size_t nz = 4; + size_t nw = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + nz -= shape & 3u; shape >>= 2; + nw -= shape & 3u; shape >>= 2; + } + for (size_t w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz) + for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *q = static_cast(*p); + return block_size_bits; + } + + // decode 4D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, ExternalType* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + const InternalType* q = begin(offset); + size_t nx = 4; + size_t ny = 4; + size_t nz = 4; + size_t nw = 4; + if (shape) { + nx -= shape & 3u; shape >>= 2; + ny -= shape & 3u; shape >>= 2; + nz -= shape & 3u; shape >>= 2; + nw -= shape & 3u; shape >>= 2; + } + for (size_t w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz) + for (size_t z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (size_t y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (size_t x = 0; x < nx; x++, p += sx, q++) + *p = static_cast(*q); + return block_size_bits; + } + +protected: + using generic_base<4, ExternalType, InternalType>::begin; + using generic_base<4, ExternalType, InternalType>::encode_block; + using generic_base<4, ExternalType, InternalType>::decode_block; + using generic_base<4, ExternalType, InternalType>::block_size_bits; +}; + +} // codec +} // zfp + +#endif diff --git a/include/zfp/codec/zfpcodec.hpp b/include/zfp/codec/zfpcodec.hpp new file mode 100644 index 00000000..5a880cdd --- /dev/null +++ b/include/zfp/codec/zfpcodec.hpp @@ -0,0 +1,551 @@ +#ifndef ZFP_ZFP_CODEC_HPP +#define ZFP_ZFP_CODEC_HPP + +#include +#include +#include +#include "zfp.h" +#include "zfp.hpp" +#include "zfp/internal/array/memory.hpp" +#include "zfp/internal/array/traits.hpp" + +namespace zfp { +namespace codec { + +// abstract base class for zfp coding of {float, double} x {1D, 2D, 3D, 4D} data +template +class zfp_base { +protected: + // default constructor + zfp_base() : + stream(zfp_stream_open(0)) +#ifdef _OPENMP + , thread_safety(false) +#endif + {} + + // destructor + ~zfp_base() + { + close(); + zfp_stream_close(stream); + } + +public: + // assignment operator--performs deep copy + zfp_base& operator=(const zfp_base& codec) + { + if (this != &codec) + deep_copy(codec); + return *this; + } + + // conservative buffer size for current codec settings + size_t buffer_size(const zfp_field* field) const + { + // empty field case + if (!field->nx && !field->ny && !field->nz && !field->nw) + return 0; + // variable-rate case + if (zfp_stream_compression_mode(stream) != zfp_mode_fixed_rate) + return zfp_stream_maximum_size(stream, field); + // fixed-rate case: exclude header + size_t blocks = zfp_field_blocks(field); + return zfp::internal::round_up(blocks * stream->maxbits, stream_alignment()) / CHAR_BIT; + } + + // open bit stream + void open(void* data, size_t size) + { + 
zfp_stream_set_bit_stream(stream, stream_open(data, size)); + } + + // close bit stream + void close() + { + stream_close(zfp_stream_bit_stream(stream)); + zfp_stream_set_bit_stream(stream, 0); + } + + // compression mode + zfp_mode mode() const { return zfp_stream_compression_mode(stream); } + + // rate in compressed bits/value (fixed-rate mode only) + double rate() const { return zfp_stream_rate(stream, dims); } + + // precision in uncompressed bits/value (fixed-precision mode only) + uint precision() const { return zfp_stream_precision(stream); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return zfp_stream_accuracy(stream); } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { zfp_stream_params(stream, minbits, maxbits, maxprec, minexp); } + + // enable reversible (lossless) mode + void set_reversible() { zfp_stream_set_reversible(stream); } + + // set rate in compressed bits/value + double set_rate(double rate, bool align) { return zfp_stream_set_rate(stream, rate, type, dims, align); } + + // set precision in uncompressed bits/value + uint set_precision(uint precision) { return zfp_stream_set_precision(stream, precision); } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) { return zfp_stream_set_accuracy(stream, tolerance); } + + // set expert mode parameters + bool set_params(uint minbits, uint maxbits, uint maxprec, int maxexp) { return zfp_stream_set_params(stream, minbits, maxbits, maxprec, maxexp) == zfp_true; } + + // set thread safety mode +#ifdef _OPENMP + void set_thread_safety(bool safety) { thread_safety = safety; } +#else + void set_thread_safety(bool) {} +#endif + + // byte size of codec data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) { + size += sizeof(*stream); + size += sizeof(*this); + } + return size; + } + + // unit of allocated data in bytes + static size_t alignment() { return stream_alignment() / CHAR_BIT; } + + static const zfp_type type = zfp::internal::trait::type; // scalar type + + // zfp::codec::zfp_base::header class for array (de)serialization + #include "zfp/internal/codec/zfpheader.hpp" + +protected: + // deep copy + void deep_copy(const zfp_base& codec) + { + stream = zfp_stream_open(0); + *stream = *codec.stream; + stream->stream = 0; +#ifdef _OPENMP + thread_safety = codec.thread_safety; +#endif + } + + // make a thread-local copy of zfp stream and bit stream + zfp_stream clone_stream() const + { + zfp_stream zfp = *stream; + zfp.stream = stream_clone(zfp.stream); + return zfp; + } + + // encode full contiguous block + size_t encode_block(bitstream_offset offset, const Scalar* block) const + { + if (thread_safety) { + // make a thread-local copy of zfp stream and bit stream + zfp_stream zfp = clone_stream(); + size_t size = encode_block(&zfp, offset, block); + stream_close(zfp.stream); + return size; + } + else + return encode_block(stream, offset, block); + } + + // decode full contiguous block + size_t decode_block(bitstream_offset offset, Scalar* block) const + { + if (thread_safety) { + // make a thread-local copy of zfp stream and bit stream + zfp_stream zfp = clone_stream(); + size_t size = decode_block(&zfp, offset, block); + stream_close(zfp.stream); + return size; + } + else + return decode_block(stream, offset, block); + } + + // encode full contiguous block + static 
size_t encode_block(zfp_stream* zfp, bitstream_offset offset, const Scalar* block) + { + stream_wseek(zfp->stream, offset); + size_t size = zfp::encode_block(zfp, block); + stream_flush(zfp->stream); + return size; + } + + // decode full contiguous block + static size_t decode_block(zfp_stream* zfp, bitstream_offset offset, Scalar* block) + { + stream_rseek(zfp->stream, offset); + size_t size = zfp::decode_block(zfp, block); + stream_align(zfp->stream); + return size; + } + + zfp_stream* stream; // compressed zfp stream +#ifdef _OPENMP + bool thread_safety; // thread safety state +#else + static const bool thread_safety = false; // not needed without OpenMP +#endif +}; + +// 1D codec +template +class zfp1 : public zfp_base<1, Scalar> { +public: + // encode contiguous 1D block + size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1) + : encode_block(offset, block); + } + + // decode contiguous 1D block + size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const + { + return shape ? decode_block_strided(offset, shape, block, 1) + : decode_block(offset, block); + } + + // encode 1D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = encode_block_strided(&zfp, offset, shape, p, sx); + stream_close(zfp.stream); + return size; + } + else + return encode_block_strided(stream, offset, shape, p, sx); + } + + // decode 1D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = decode_block_strided(&zfp, offset, shape, p, sx); + stream_close(zfp.stream); + return size; + } + else + return decode_block_strided(stream, offset, shape, p, sx); + } + +protected: + using zfp_base<1, Scalar>::clone_stream; + using zfp_base<1, Scalar>::encode_block; + using zfp_base<1, Scalar>::decode_block; + using zfp_base<1, Scalar>::stream; + using zfp_base<1, Scalar>::thread_safety; + + // encode 1D block from strided storage + static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx) + { + size_t size; + stream_wseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + size = zfp::encode_partial_block_strided(zfp, p, nx, sx); + } + else + size = zfp::encode_block_strided(zfp, p, sx); + stream_flush(zfp->stream); + return size; + } + + // decode 1D block to strided storage + static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx) + { + size_t size; + stream_rseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + size = zfp::decode_partial_block_strided(zfp, p, nx, sx); + } + else + size = zfp::decode_block_strided(zfp, p, sx); + stream_align(zfp->stream); + return size; + } +}; + +// 2D codec +template +class zfp2 : public zfp_base<2, Scalar> { +public: + // encode contiguous 2D block + size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const + { + return shape ? 
encode_block_strided(offset, shape, block, 1, 4) + : encode_block(offset, block); + } + + // decode contiguous 2D block + size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const + { + return shape ? decode_block_strided(offset, shape, block, 1, 4) + : decode_block(offset, block); + } + + // encode 2D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy); + stream_close(zfp.stream); + return size; + } + else + return encode_block_strided(stream, offset, shape, p, sx, sy); + } + + // decode 2D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy); + stream_close(zfp.stream); + return size; + } + else + return decode_block_strided(stream, offset, shape, p, sx, sy); + } + +protected: + using zfp_base<2, Scalar>::clone_stream; + using zfp_base<2, Scalar>::encode_block; + using zfp_base<2, Scalar>::decode_block; + using zfp_base<2, Scalar>::stream; + using zfp_base<2, Scalar>::thread_safety; + + // encode 2D block from strided storage + static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) + { + size_t size; + stream_wseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + size = zfp::encode_partial_block_strided(zfp, p, nx, ny, sx, sy); + } + else + size = zfp::encode_block_strided(zfp, p, sx, sy); + stream_flush(zfp->stream); + return size; + } + + // decode 2D block to strided storage + static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) + { + size_t size; + stream_rseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + size = zfp::decode_partial_block_strided(zfp, p, nx, ny, sx, sy); + } + else + size = zfp::decode_block_strided(zfp, p, sx, sy); + stream_align(zfp->stream); + return size; + } +}; + +// 3D codec +template +class zfp3 : public zfp_base<3, Scalar> { +public: + // encode contiguous 3D block + size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1, 4, 16) + : encode_block(offset, block); + } + + // decode contiguous 3D block + size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const + { + return shape ? 
decode_block_strided(offset, shape, block, 1, 4, 16) + : decode_block(offset, block); + } + + // encode 3D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy, sz); + stream_close(zfp.stream); + return size; + } + else + return encode_block_strided(stream, offset, shape, p, sx, sy, sz); + } + + // decode 3D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy, sz); + stream_close(zfp.stream); + return size; + } + else + return decode_block_strided(stream, offset, shape, p, sx, sy, sz); + } + +protected: + using zfp_base<3, Scalar>::clone_stream; + using zfp_base<3, Scalar>::encode_block; + using zfp_base<3, Scalar>::decode_block; + using zfp_base<3, Scalar>::stream; + using zfp_base<3, Scalar>::thread_safety; + + // encode 3D block from strided storage + static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) + { + size_t size; + stream_wseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + size = zfp::encode_partial_block_strided(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + size = zfp::encode_block_strided(zfp, p, sx, sy, sz); + stream_flush(zfp->stream); + return size; + } + + // decode 3D block to strided storage + static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) + { + size_t size; + stream_rseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + size = zfp::decode_partial_block_strided(zfp, p, nx, ny, nz, sx, sy, sz); + } + else + size = zfp::decode_block_strided(zfp, p, sx, sy, sz); + stream_align(zfp->stream); + return size; + } +}; + +// 4D codec +template +class zfp4 : public zfp_base<4, Scalar> { +public: + // encode contiguous 4D block + size_t encode_block(bitstream_offset offset, uint shape, const Scalar* block) const + { + return shape ? encode_block_strided(offset, shape, block, 1, 4, 16, 64) + : encode_block(offset, block); + } + + // decode contiguous 4D block + size_t decode_block(bitstream_offset offset, uint shape, Scalar* block) const + { + return shape ? 
decode_block_strided(offset, shape, block, 1, 4, 16, 64) + : decode_block(offset, block); + } + + // encode 4D block from strided storage + size_t encode_block_strided(bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = encode_block_strided(&zfp, offset, shape, p, sx, sy, sz, sw); + stream_close(zfp.stream); + return size; + } + else + return encode_block_strided(stream, offset, shape, p, sx, sy, sz, sw); + } + + // decode 4D block to strided storage + size_t decode_block_strided(bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + if (thread_safety) { + // thread-safe implementation + zfp_stream zfp = clone_stream(); + size_t size = decode_block_strided(&zfp, offset, shape, p, sx, sy, sz, sw); + stream_close(zfp.stream); + return size; + } + else + return decode_block_strided(stream, offset, shape, p, sx, sy, sz, sw); + } + +protected: + using zfp_base<4, Scalar>::clone_stream; + using zfp_base<4, Scalar>::encode_block; + using zfp_base<4, Scalar>::decode_block; + using zfp_base<4, Scalar>::stream; + using zfp_base<4, Scalar>::thread_safety; + + // encode 4D block from strided storage + static size_t encode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) + { + size_t size; + stream_wseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + uint nw = 4 - (shape & 3u); shape >>= 2; + size = zfp::encode_partial_block_strided(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); + } + else + size = zfp::encode_block_strided(zfp, p, sx, sy, sz, sw); + stream_flush(zfp->stream); + return size; + } + + // decode 4D block to strided storage + static size_t decode_block_strided(zfp_stream* zfp, bitstream_offset offset, uint shape, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) + { + size_t size; + stream_rseek(zfp->stream, offset); + if (shape) { + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + uint nw = 4 - (shape & 3u); shape >>= 2; + size = zfp::decode_partial_block_strided(zfp, p, nx, ny, nz, nw, sx, sy, sz, sw); + } + else + size = zfp::decode_block_strided(zfp, p, sx, sy, sz, sw); + stream_align(zfp->stream); + return size; + } +}; + +} // codec +} // zfp + +#endif diff --git a/include/zfp/constarray1.hpp b/include/zfp/constarray1.hpp new file mode 100644 index 00000000..f2f501de --- /dev/null +++ b/include/zfp/constarray1.hpp @@ -0,0 +1,265 @@ +#ifndef ZFP_CONSTARRAY1_HPP +#define ZFP_CONSTARRAY1_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache1.hpp" +#include "zfp/internal/array/handle1.hpp" +#include "zfp/internal/array/iterator1.hpp" +#include "zfp/internal/array/pointer1.hpp" +#include "zfp/internal/array/reference1.hpp" +#include "zfp/internal/array/store1.hpp" +#include "zfp/internal/array/view1.hpp" + +namespace zfp { + +// compressed 1D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp1, + class Index = zfp::index::hybrid4 +> +class const_array1 : public array { +public: + // types utilized by nested classes + typedef 
const_array1 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore1 store_type; + typedef zfp::internal::BlockCache1 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim1::const_reference const_reference; + typedef zfp::internal::dim1::const_pointer const_pointer; + typedef zfp::internal::dim1::const_iterator const_iterator; + typedef zfp::internal::dim1::const_view const_view; + typedef zfp::internal::dim1::private_const_view private_const_view; + + // default constructor + const_array1() : + array(1, Codec::type), + cache(store) + {} + + // constructor of nx-element array using given configuration, at least + // cache_size bytes of cache, and optionally initialized from flat array p + const_array1(size_t nx, const zfp_config& config, const value_type* p = 0, size_t cache_size = 0) : + array(1, Codec::type), + store(nx, config), + cache(store, cache_size) + { + this->nx = nx; + set(p); + } + + // copy constructor--performs a deep copy + const_array1(const const_array1& a) : + cache(store) + { + deep_copy(a); + } + + // virtual destructor + virtual ~const_array1() {} + + // assignment operator--performs a deep copy + const_array1& operator=(const const_array1& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx; } + + // array dimensions + size_t size_x() const { return nx; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, bool clear = true) + { + cache.clear(); + this->nx = nx; + store.resize(nx, clear); + } + + // compression mode + zfp_mode mode() const { return store.mode(); } + + // rate in compressed bits per value (fixed-rate mode only) + double rate() const { return store.rate(); } + + // precision in uncompressed bits per value (fixed-precision mode only) + uint precision() const { return store.precision(); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return store.accuracy(); } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); } + + // set rate in compressed bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, false); + } + + // set precision in uncompressed bits per value + uint set_precision(uint precision) + { + cache.clear(); + return store.set_precision(precision); + } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) + { + cache.clear(); + return store.set_accuracy(tolerance); + } + + // enable reversible (lossless) mode + void set_reversible() + { + cache.clear(); + store.set_reversible(); + } + + // set expert mode compression parameters + bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) + { + cache.clear(); + return store.set_params(minbits, maxbits, maxprec, minexp); + } + + // set compression mode and parameters + void set_config(const zfp_config& config) + { + cache.clear(); + store.set_config(config); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of 
compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const ptrdiff_t sx = 1; + size_t block_index = 0; + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p, bool compact = true) + { + cache.clear(); + store.clear(); + const size_t bx = store.block_size_x(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + for (size_t i = 0; i < bx; i++, p += 4) + store.encode(block_index++, p, sx); + } + else { + // zero-initialize array + const value_type block[4] = {}; + while (block_index < bx) + store.encode(block_index++, block); + } + store.flush(); + if (compact) + store.compact(); + } + + // accessor + const_reference operator()(size_t i) const { return const_reference(const_cast(this), i); } + + // flat index accessor + const_reference operator[](size_t index) const { return const_reference(const_cast(this), index); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0); } + const_iterator cend() const { return const_iterator(this, nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_reference; + friend class zfp::internal::dim1::const_pointer; + friend class zfp::internal::dim1::const_iterator; + friend class zfp::internal::dim1::const_view; + friend class zfp::internal::dim1::private_const_view; + + // perform a deep copy + void deep_copy(const const_array1& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + + // inspector + value_type get(size_t i) const { return cache.get(i); } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef const_array1 const_array1f; +typedef const_array1 const_array1d; + +} + +#endif diff --git a/include/zfp/constarray2.hpp b/include/zfp/constarray2.hpp new file mode 100644 index 00000000..e8928629 --- /dev/null +++ b/include/zfp/constarray2.hpp @@ -0,0 +1,288 @@ +#ifndef ZFP_CONSTARRAY2_HPP +#define ZFP_CONSTARRAY2_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache2.hpp" +#include "zfp/internal/array/handle2.hpp" +#include "zfp/internal/array/iterator2.hpp" +#include "zfp/internal/array/pointer2.hpp" +#include "zfp/internal/array/reference2.hpp" +#include "zfp/internal/array/store2.hpp" +#include "zfp/internal/array/view2.hpp" + +namespace zfp { + 
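The 2D, 3D, and 4D read-only arrays that follow repeat the pattern just established by const_array1: construct with a zfp_config, compress the source data once via set(), then decompress blocks on demand through the cache. A minimal sketch of that pattern, assuming the zfp_config_accuracy() helper from the zfp.h C API:

  #include "zfp/constarray1.hpp"

  double read_back_example()
  {
    double data[1000];
    for (int i = 0; i < 1000; i++)
      data[i] = 0.001 * i;                            // synthetic input
    zfp_config config = zfp_config_accuracy(1e-6);    // fixed-accuracy mode
    zfp::const_array1<double> a(1000, config, data);  // compressed on construction
    return a[500];                                    // decompressed on demand
  }
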
+// compressed 2D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp2, + class Index = zfp::index::hybrid4 +> +class const_array2 : public array { +public: + // types utilized by nested classes + typedef const_array2 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore2 store_type; + typedef zfp::internal::BlockCache2 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim2::const_reference const_reference; + typedef zfp::internal::dim2::const_pointer const_pointer; + typedef zfp::internal::dim2::const_iterator const_iterator; + typedef zfp::internal::dim2::const_view const_view; + typedef zfp::internal::dim2::private_const_view private_const_view; + + // default constructor + const_array2() : + array(2, Codec::type), + cache(store) + {} + + // constructor of nx * ny array using given configuration, at least + // cache_size bytes of cache, and optionally initialized from flat array p + const_array2(size_t nx, size_t ny, const zfp_config& config, const value_type* p = 0, size_t cache_size = 0) : + array(2, Codec::type), + store(nx, ny, config), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + set(p); + } + + // copy constructor--performs a deep copy + const_array2(const const_array2& a) : + cache(store) + { + deep_copy(a); + } + + // virtual destructor + virtual ~const_array2() {} + + // assignment operator--performs a deep copy + const_array2& operator=(const const_array2& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, bool clear = true) + { + cache.clear(); + this->nx = nx; + this->ny = ny; + store.resize(nx, ny, clear); + } + + // compression mode + zfp_mode mode() const { return store.mode(); } + + // rate in compressed bits per value (fixed-rate mode only) + double rate() const { return store.rate(); } + + // precision in uncompressed bits per value (fixed-precision mode only) + uint precision() const { return store.precision(); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return store.accuracy(); } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); } + + // set rate in compressed bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, false); + } + + // set precision in uncompressed bits per value + uint set_precision(uint precision) + { + cache.clear(); + return store.set_precision(precision); + } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) + { + cache.clear(); + return store.set_accuracy(tolerance); + } + + // enable reversible (lossless) mode + void set_reversible() + { + cache.clear(); + store.set_reversible(); + } + + // set expert mode compression parameters + bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) + { + cache.clear(); + return store.set_params(minbits, maxbits, maxprec, minexp); + } + + // set compression mode and parameters + void set_config(const zfp_config& config) + { + cache.clear(); + 
store.set_config(config); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + size_t block_index = 0; + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p, bool compact = true) + { + cache.clear(); + store.clear(); + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + store.encode(block_index++, p, sx, sy); + } + else { + // zero-initialize array + const value_type block[4 * 4] = {}; + while (block_index < bx * by) + store.encode(block_index++, block); + } + store.flush(); + if (compact) + store.compact(); + } + + // (i, j) accessor + const_reference operator()(size_t i, size_t j) const { return const_reference(const_cast(this), i, j); } + + // flat index accessor + const_reference operator[](size_t index) const + { + size_t i, j; + ij(i, j, index); + return const_reference(const_cast(this), i, j); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_reference; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::const_iterator; + friend class zfp::internal::dim2::const_view; + friend class zfp::internal::dim2::private_const_view; + + // perform a deep copy + void deep_copy(const const_array2& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + + // inspector + value_type get(size_t i, size_t j) const { return cache.get(i, j); } + + // convert flat index to (i, j) + void ij(size_t& i, size_t& j, size_t index) const + { + i = index 
% nx; index /= nx; + j = index % ny; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef const_array2 const_array2f; +typedef const_array2 const_array2d; + +} + +#endif diff --git a/include/zfp/constarray3.hpp b/include/zfp/constarray3.hpp new file mode 100644 index 00000000..61d65d46 --- /dev/null +++ b/include/zfp/constarray3.hpp @@ -0,0 +1,300 @@ +#ifndef ZFP_CONSTARRAY3_HPP +#define ZFP_CONSTARRAY3_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache3.hpp" +#include "zfp/internal/array/handle3.hpp" +#include "zfp/internal/array/iterator3.hpp" +#include "zfp/internal/array/pointer3.hpp" +#include "zfp/internal/array/reference3.hpp" +#include "zfp/internal/array/store3.hpp" +#include "zfp/internal/array/view3.hpp" + +namespace zfp { + +// compressed 3D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp3, + class Index = zfp::index::hybrid4 +> +class const_array3 : public array { +public: + // types utilized by nested classes + typedef const_array3 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore3 store_type; + typedef zfp::internal::BlockCache3 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim3::const_reference const_reference; + typedef zfp::internal::dim3::const_pointer const_pointer; + typedef zfp::internal::dim3::const_iterator const_iterator; + typedef zfp::internal::dim3::const_view const_view; + typedef zfp::internal::dim3::private_const_view private_const_view; + + // default constructor + const_array3() : + array(3, Codec::type), + cache(store) + {} + + // constructor of nx * ny * nz array using given configuration, at least + // cache_size bytes of cache, and optionally initialized from flat array p + const_array3(size_t nx, size_t ny, size_t nz, const zfp_config& config, const value_type* p = 0, size_t cache_size = 0) : + array(3, Codec::type), + store(nx, ny, nz, config), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + this->nz = nz; + set(p); + } + + // copy constructor--performs a deep copy + const_array3(const const_array3& a) : + cache(store) + { + deep_copy(a); + } + + // virtual destructor + virtual ~const_array3() {} + + // assignment operator--performs a deep copy + const_array3& operator=(const const_array3& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny * nz; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, size_t nz, bool clear = true) + { + cache.clear(); + this->nx = nx; + this->ny = ny; + this->nz = nz; + store.resize(nx, ny, nz, clear); + } + + // compression mode + zfp_mode mode() const { return store.mode(); } + + // rate in compressed bits per value (fixed-rate mode only) + double rate() const { return store.rate(); } + + // precision in uncompressed bits per value (fixed-precision mode only) + uint precision() const { return store.precision(); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return store.accuracy(); } + + // compression parameters 
(all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); } + + // set rate in compressed bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, false); + } + + // set precision in uncompressed bits per value + uint set_precision(uint precision) + { + cache.clear(); + return store.set_precision(precision); + } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) + { + cache.clear(); + return store.set_accuracy(tolerance); + } + + // enable reversible (lossless) mode + void set_reversible() + { + cache.clear(); + store.set_reversible(); + } + + // set expert mode compression parameters + bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) + { + cache.clear(); + return store.set_params(minbits, maxbits, maxprec, minexp); + } + + // set compression mode and parameters + void set_config(const zfp_config& config) + { + cache.clear(); + store.set_config(config); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + size_t block_index = 0; + for (size_t k = 0; k < bz; k++, p += 4 * sy * (ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy, sz); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p, bool compact = true) + { + cache.clear(); + store.clear(); + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + for (size_t k = 0; k < bz; k++, p += 4 * sy * (ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + store.encode(block_index++, p, sx, sy, sz); + } + else { + // zero-initialize array + const value_type block[4 * 4 * 4] = {}; + while (block_index < bx * by * bz) + store.encode(block_index++, block); + } + store.flush(); + if (compact) + store.compact(); + } + + // (i, j, k) accessor + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(const_cast(this), i, j, k); } + 
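  // Illustration (hypothetical sizes): operator[] below flattens the array with
  // i (x) varying fastest, i.e., index = i + nx * (j + ny * k), which ijk()
  // inverts. With nx = 10 and ny = 10, index 345 maps to i = 345 % 10 = 5,
  // j = (345 / 10) % 10 = 4, k = 345 / 100 = 3.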
+ // flat index accessor + const_reference operator[](size_t index) const + { + size_t i, j, k; + ijk(i, j, k, index); + return const_reference(const_cast(this), i, j, k); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, 0, nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_reference; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + friend class zfp::internal::dim3::const_view; + friend class zfp::internal::dim3::private_const_view; + + // perform a deep copy + void deep_copy(const const_array3& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + size_t min_z() const { return 0; } + size_t max_z() const { return nz; } + + // inspector + value_type get(size_t i, size_t j, size_t k) const { return cache.get(i, j, k); } + + // convert flat index to (i, j, k) + void ijk(size_t& i, size_t& j, size_t& k, size_t index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef const_array3 const_array3f; +typedef const_array3 const_array3d; + +} + +#endif diff --git a/include/zfp/constarray4.hpp b/include/zfp/constarray4.hpp new file mode 100644 index 00000000..63680f16 --- /dev/null +++ b/include/zfp/constarray4.hpp @@ -0,0 +1,312 @@ +#ifndef ZFP_CONSTARRAY4_HPP +#define ZFP_CONSTARRAY4_HPP + +#include +#include +#include +#include "zfp/array.hpp" +#include "zfp/index.hpp" +#include "zfp/codec/zfpcodec.hpp" +#include "zfp/internal/array/cache4.hpp" +#include "zfp/internal/array/handle4.hpp" +#include "zfp/internal/array/iterator4.hpp" +#include "zfp/internal/array/pointer4.hpp" +#include "zfp/internal/array/reference4.hpp" +#include "zfp/internal/array/store4.hpp" +#include "zfp/internal/array/view4.hpp" + +namespace zfp { + +// compressed 4D array of scalars +template < + typename Scalar, + class Codec = zfp::codec::zfp4, + class Index = zfp::index::hybrid4 +> +class const_array4 : public array { +public: + // types utilized by nested classes + typedef const_array4 container_type; + typedef Scalar value_type; + typedef Codec codec_type; + typedef Index index_type; + typedef zfp::internal::BlockStore4 store_type; + typedef zfp::internal::BlockCache4 cache_type; + typedef typename Codec::header header; + + // accessor classes + typedef zfp::internal::dim4::const_reference const_reference; + typedef zfp::internal::dim4::const_pointer const_pointer; + typedef zfp::internal::dim4::const_iterator const_iterator; + typedef zfp::internal::dim4::const_view const_view; + typedef zfp::internal::dim4::private_const_view private_const_view; + + // default constructor + const_array4() : + array(4, Codec::type), + cache(store) + {} + + // constructor of nx * ny * nz * nw array using given configuration, at least + // cache_size bytes of cache, and optionally initialized from flat array p + const_array4(size_t nx, size_t ny, 
size_t nz, size_t nw, const zfp_config& config, const value_type* p = 0, size_t cache_size = 0) : + array(4, Codec::type), + store(nx, ny, nz, nw, config), + cache(store, cache_size) + { + this->nx = nx; + this->ny = ny; + this->nz = nz; + this->nw = nw; + set(p); + } + + // copy constructor--performs a deep copy + const_array4(const const_array4& a) : + cache(store) + { + deep_copy(a); + } + + // virtual destructor + virtual ~const_array4() {} + + // assignment operator--performs a deep copy + const_array4& operator=(const const_array4& a) + { + if (this != &a) + deep_copy(a); + return *this; + } + + // total number of elements in array + size_t size() const { return nx * ny * nz * nw; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // resize the array (all previously stored data will be lost) + void resize(size_t nx, size_t ny, size_t nz, size_t nw, bool clear = true) + { + cache.clear(); + this->nx = nx; + this->ny = ny; + this->nz = nz; + this->nw = nw; + store.resize(nx, ny, nz, nw, clear); + } + + // compression mode + zfp_mode mode() const { return store.mode(); } + + // rate in compressed bits per value (fixed-rate mode only) + double rate() const { return store.rate(); } + + // precision in uncompressed bits per value (fixed-precision mode only) + uint precision() const { return store.precision(); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return store.accuracy(); } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { return store.params(minbits, maxbits, maxprec, minexp); } + + // set rate in compressed bits per value + double set_rate(double rate) + { + cache.clear(); + return store.set_rate(rate, false); + } + + // set precision in uncompressed bits per value + uint set_precision(uint precision) + { + cache.clear(); + return store.set_precision(precision); + } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) + { + cache.clear(); + return store.set_accuracy(tolerance); + } + + // enable reversible (lossless) mode + void set_reversible() + { + cache.clear(); + store.set_reversible(); + } + + // set expert mode compression parameters + bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) + { + cache.clear(); + return store.set_params(minbits, maxbits, maxprec, minexp); + } + + // set compression mode and parameters + void set_config(const zfp_config& config) + { + cache.clear(); + store.set_config(config); + } + + // byte size of array data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += store.size_bytes(mask); + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return store.compressed_size(); } + + // pointer to compressed data for read or write access + void* compressed_data() const + { + cache.flush(); + return store.compressed_data(); + } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) + { + cache.flush(); + cache.resize(bytes); + } + + // empty cache without compressing modified cached blocks + void 
clear_cache() const { cache.clear(); } + + // decompress array and store at p + void get(value_type* p) const + { + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const size_t bw = store.block_size_w(); + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + const ptrdiff_t sw = static_cast(nx * ny * nz); + size_t block_index = 0; + for (size_t l = 0; l < bw; l++, p += 4 * sz * (nz - bz)) + for (size_t k = 0; k < bz; k++, p += 4 * sy * (ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + cache.get_block(block_index++, p, sx, sy, sz, sw); + } + + // initialize array by copying and compressing data stored at p + void set(const value_type* p, bool compact = true) + { + cache.clear(); + store.clear(); + const size_t bx = store.block_size_x(); + const size_t by = store.block_size_y(); + const size_t bz = store.block_size_z(); + const size_t bw = store.block_size_w(); + size_t block_index = 0; + if (p) { + // compress data stored at p + const ptrdiff_t sx = 1; + const ptrdiff_t sy = static_cast(nx); + const ptrdiff_t sz = static_cast(nx * ny); + const ptrdiff_t sw = static_cast(nx * ny * nz); + for (size_t l = 0; l < bw; l++, p += 4 * sz * (nz - bz)) + for (size_t k = 0; k < bz; k++, p += 4 * sy * (ny - by)) + for (size_t j = 0; j < by; j++, p += 4 * sx * (nx - bx)) + for (size_t i = 0; i < bx; i++, p += 4) + store.encode(block_index++, p, sx, sy, sz, sw); + } + else { + // zero-initialize array + const value_type block[4 * 4 * 4 * 4] = {}; + while (block_index < bx * by * bz * bw) + store.encode(block_index++, block); + } + store.flush(); + if (compact) + store.compact(); + } + + // (i, j, k, l) accessor + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(const_cast(this), i, j, k, l); } + + // flat index accessor + const_reference operator[](size_t index) const + { + size_t i, j, k, l; + ijkl(i, j, k, l, index); + return const_reference(const_cast(this), i, j, k, l); + } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, 0, 0, 0, 0); } + const_iterator cend() const { return const_iterator(this, 0, 0, 0, nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_reference; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + friend class zfp::internal::dim4::const_view; + friend class zfp::internal::dim4::private_const_view; + + // perform a deep copy + void deep_copy(const const_array4& a) + { + // copy base class members + array::deep_copy(a); + // copy persistent storage + store.deep_copy(a.store); + // copy cached data + cache.deep_copy(a.cache); + } + + // global index bounds + size_t min_x() const { return 0; } + size_t max_x() const { return nx; } + size_t min_y() const { return 0; } + size_t max_y() const { return ny; } + size_t min_z() const { return 0; } + size_t max_z() const { return nz; } + size_t min_w() const { return 0; } + size_t max_w() const { return nw; } + + // inspector + value_type get(size_t i, size_t j, size_t k, size_t l) const { return cache.get(i, j, k, l); } + + // convert flat index to (i, j, k, l) + void ijkl(size_t& i, size_t& j, size_t& k, size_t& l, size_t index) const + { + i 
= index % nx; index /= nx; + j = index % ny; index /= ny; + k = index % nz; index /= nz; + l = index; + } + + store_type store; // persistent storage of compressed blocks + cache_type cache; // cache of decompressed blocks +}; + +typedef const_array4 const_array4f; +typedef const_array4 const_array4d; + +} + +#endif diff --git a/include/zfp/factory.hpp b/include/zfp/factory.hpp new file mode 100644 index 00000000..73091514 --- /dev/null +++ b/include/zfp/factory.hpp @@ -0,0 +1,119 @@ +#ifndef ZFP_FACTORY_HPP +#define ZFP_FACTORY_HPP + +// ensure zfp/array.hpp has already been included +#ifndef ZFP_ARRAY_HPP + #error "zfp/array.hpp must be included before zfp/factory.hpp" +#endif + +zfp::array* zfp::array::construct(const zfp::array::header& header, const void* buffer, size_t buffer_size_bytes) +{ + // extract metadata from header + const zfp_type type = header.scalar_type(); + const double rate = header.rate(); + const uint dims = header.dimensionality(); + const size_t nx = header.size_x(); + const size_t ny = header.size_y(); + const size_t nz = header.size_z(); + const size_t nw = header.size_w(); + + // construct once (passing zfp::array::header will read it again) + zfp::array* arr = 0; + std::string error; + switch (dims) { + case 4: +#ifdef ZFP_ARRAY4_HPP + switch (type) { + case zfp_type_float: + arr = new zfp::array4f(nx, ny, nz, nw, rate); + break; + case zfp_type_double: + arr = new zfp::array4d(nx, ny, nz, nw, rate); + break; + default: + /* NOTREACHED */ + error = "zfp scalar type not supported"; + break; + } +#else + error = "array4 not supported; include zfp/array4.hpp before zfp/factory.hpp"; +#endif + break; + + case 3: +#ifdef ZFP_ARRAY3_HPP + switch (type) { + case zfp_type_float: + arr = new zfp::array3f(nx, ny, nz, rate); + break; + case zfp_type_double: + arr = new zfp::array3d(nx, ny, nz, rate); + break; + default: + /* NOTREACHED */ + error = "zfp scalar type not supported"; + break; + } +#else + error = "array3 not supported; include zfp/array3.hpp before zfp/factory.hpp"; +#endif + break; + + case 2: +#ifdef ZFP_ARRAY2_HPP + switch (type) { + case zfp_type_float: + arr = new zfp::array2f(nx, ny, rate); + break; + case zfp_type_double: + arr = new zfp::array2d(nx, ny, rate); + break; + default: + /* NOTREACHED */ + error = "zfp scalar type not supported"; + break; + } +#else + error = "array2 not supported; include zfp/array2.hpp before zfp/factory.hpp"; +#endif + break; + + case 1: +#ifdef ZFP_ARRAY1_HPP + switch (type) { + case zfp_type_float: + arr = new zfp::array1f(nx, rate); + break; + case zfp_type_double: + arr = new zfp::array1d(nx, rate); + break; + default: + /* NOTREACHED */ + error = "zfp scalar type not supported"; + break; + } +#else + error = "array1 not supported; include zfp/array1.hpp before zfp/factory.hpp"; +#endif + break; + + default: + error = "zfp array dimensionality other than {1, 2, 3, 4} not supported"; + break; + } + + if (!error.empty()) + throw zfp::exception(error); + + if (buffer) { + if (buffer_size_bytes && buffer_size_bytes < arr->compressed_size()) { + delete arr; + throw zfp::exception("zfp buffer size is smaller than required"); + } + std::memcpy(arr->compressed_data(), buffer, arr->compressed_size()); + } + + return arr; +} + +#endif diff --git a/include/zfp/index.hpp b/include/zfp/index.hpp new file mode 100644 index 00000000..b84e9b75 --- /dev/null +++ b/include/zfp/index.hpp @@ -0,0 +1,537 @@ +#ifndef ZFP_INDEX_HPP +#define ZFP_INDEX_HPP + +#include +#include "zfp/internal/array/memory.hpp" + +namespace zfp { +namespace 
index { + +// implicit block index (fixed-size blocks; 0 bits/block; 64-bit offsets) ----- +class implicit { +public: + // constructor + implicit(size_t blocks) : + bits_per_block(0) + { + resize(blocks); + } + + // destructor + ~implicit() {} + + // byte size of index data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // range of offsets spanned by indexed data in bits + bitstream_size range() const { return block_offset(blocks); } + + // bit size of given block + size_t block_size(size_t /*block_index*/) const { return bits_per_block; } + + // bit offset of given block + bitstream_offset block_offset(size_t block_index) const { return block_index * bits_per_block; } + + // reset index + void clear() {} + + // resize index in number of blocks + void resize(size_t blocks) { this->blocks = blocks; } + + // flush any buffered data + void flush() {} + + // set bit size of all blocks + void set_block_size(size_t size) { bits_per_block = size; } + + // set bit size of given block (ignored for performance reasons) + void set_block_size(size_t /*block_index*/, size_t /*size*/) {} + + // does not support variable rate + static bool has_variable_rate() { return false; } + +protected: + size_t blocks; // number of blocks + size_t bits_per_block; // fixed number of bits per block +}; + +// verbatim block index (64 bits/block; 64-bit offsets) ----------------------- +class verbatim { +public: + // constructor for given nbumber of blocks + verbatim(size_t blocks) : + data(0) + { + resize(blocks); + } + + // destructor + ~verbatim() { zfp::internal::deallocate(data); } + + // assignment operator--performs a deep copy + verbatim& operator=(const verbatim& index) + { + if (this != &index) + deep_copy(index); + return *this; + } + + // byte size of index data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_INDEX) + size += capacity() * sizeof(*data); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // range of offsets spanned by indexed data in bits + bitstream_size range() const { return block_offset(blocks); } + + // bit size of given block + size_t block_size(size_t block_index) const { return static_cast(block_offset(block_index + 1) - block_offset(block_index)); } + + // bit offset of given block + bitstream_offset block_offset(size_t block_index) const { return static_cast(data[block_index]); } + + // reset index + void clear() { block = 0; } + + // resize index in number of blocks + void resize(size_t blocks) + { + this->blocks = blocks; + zfp::internal::reallocate(data, capacity() * sizeof(*data)); + *data = 0; + clear(); + } + + // flush any buffered data + void flush() {} + + // set bit size of all blocks + void set_block_size(size_t size) + { + clear(); + while (block < blocks) + set_block_size(block, size); + clear(); + } + + // set bit size of given block (in sequential order) + void set_block_size(size_t block_index, size_t size) + { + if (block_index != block) + throw zfp::exception("zfp index supports only sequential build"); + if (block == blocks) + throw zfp::exception("zfp index overflow"); + data[block + 1] = data[block] + size; + block++; + } + + // supports variable rate + static bool has_variable_rate() { return true; } + +protected: + // capacity of data array + size_t capacity() const { return blocks + 1; } + + // make a deep 
copy of index + void deep_copy(const verbatim& index) + { + zfp::internal::clone(data, index.data, index.capacity()); + blocks = index.blocks; + block = index.block; + } + + uint64* data; // block offset array + size_t blocks; // number of blocks + size_t block; // current block index +}; + +// hybrid block index (4 blocks/chunk; 24 bits/block; 44-bit offsets) --------- +class hybrid4 { +public: + // constructor for given number of blocks + hybrid4(size_t blocks) : + data(0) + { + resize(blocks); + } + + // destructor + ~hybrid4() { zfp::internal::deallocate(data); } + + // assignment operator--performs a deep copy + hybrid4& operator=(const hybrid4& index) + { + if (this != &index) + deep_copy(index); + return *this; + } + + // byte size of index data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_INDEX) + size += capacity() * sizeof(*data); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // range of offsets spanned by indexed data in bits + bitstream_size range() const { return end; } + + // bit size of given block + size_t block_size(size_t block_index) const + { + size_t chunk = block_index / 4; + size_t which = block_index % 4; + return which == 3u + ? static_cast(block_offset(block_index + 1) - block_offset(block_index)) + : static_cast(data[chunk].lo[which + 1] - data[chunk].lo[which]); + } + + // bit offset of given block + bitstream_offset block_offset(size_t block_index) const + { + // if index is being built, point offset to end + if (block_index == block) + return end; + // index has already been built; decode offset + size_t chunk = block_index / 4; + size_t which = block_index % 4; + return (bitstream_offset(data[chunk].hi) << shift) + data[chunk].lo[which]; + } + + // reset index + void clear() + { + block = 0; + ptr = 0; + end = 0; + } + + void resize(size_t blocks) + { + this->blocks = blocks; + zfp::internal::reallocate(data, capacity() * sizeof(*data)); + clear(); + } + + // flush any buffered data + void flush() + { + while (block & 0x3u) + set_block_size(block, 0); + } + + // set bit size of all blocks + void set_block_size(size_t size) + { + clear(); + while (block < blocks) + set_block_size(block, size); + flush(); + clear(); + } + + // set bit size of given block (in sequential order) + void set_block_size(size_t block_index, size_t size) + { + // ensure block_index is next in sequence + if (block_index != block) + throw zfp::exception("zfp index supports only sequential build"); + // ensure block index is within bounds, but allow 0-size blocks for padding + if (block >= blocks && size) + throw zfp::exception("zfp index overflow"); + // ensure block size is valid + if (size > ZFP_MAX_BITS) + throw zfp::exception("zfp block size is too large for hybrid4 index"); + // advance end pointer + end += size; + // buffer chunk of 4 block sizes at a time + size_t chunk = block / 4; + size_t which = block % 4; + buffer[which] = size; + if (which == 3u) { + // chunk is complete; encode it + if (ptr >> (32 + shift)) + throw zfp::exception("zfp block offset is too large for hybrid4 index"); + // store high bits + data[chunk].hi = static_cast(ptr >> shift); + bitstream_offset base = bitstream_offset(data[chunk].hi) << shift; + // store low bits + for (uint k = 0; k < 4; k++) { + data[chunk].lo[k] = static_cast(ptr - base); + ptr += buffer[k]; + } + } + block++; + } + + // supports variable rate + static bool has_variable_rate() { return true; } + +protected: + // 
chunk record encoding 4 block offsets + typedef struct { + uint32 hi; // 32 most significant bits of 44-bit base offset + uint16 lo[4]; // 16-bit offsets from base + } record; + + // capacity of data array + size_t capacity() const { return (blocks + 3) / 4; } + + // make a deep copy of index + void deep_copy(const hybrid4& index) + { + zfp::internal::clone(data, index.data, index.capacity()); + blocks = index.blocks; + block = index.block; + ptr = index.ptr; + end = index.end; + std::copy(index.buffer, index.buffer + 4, buffer); + } + + static const uint shift = 12; // number of bits to shift hi bits + + record* data; // block offset array + size_t blocks; // number of blocks + size_t block; // current block index + bitstream_offset end; // offset to last block + bitstream_offset ptr; // offset to current chunk of blocks + size_t buffer[4]; // bit sizes 4 blocks to be stored together +}; + +// hybrid block index (8 blocks/chunk; 16 bits/block; 86-14dims bit offsets) -- +template +class hybrid8 { +public: + // constructor for given number of blocks + hybrid8(size_t blocks) : + data(0) + { + resize(blocks); + } + + // destructor + ~hybrid8() { zfp::internal::deallocate(data); } + + // assignment operator--performs a deep copy + hybrid8& operator=(const hybrid8& index) + { + if (this != &index) + deep_copy(index); + return *this; + } + + // byte size of index data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_INDEX) + size += capacity() * sizeof(*data); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // range of offsets spanned by indexed data in bits + bitstream_size range() const { return end; } + + // bit size of given block + size_t block_size(size_t block_index) const + { + size_t chunk = block_index / 8; + size_t which = block_index % 8; + return which == 7u + ? 
static_cast(block_offset(block_index + 1) - block_offset(block_index)) + : size(data[2 * chunk + 0], data[2 * chunk + 1], static_cast(which)); + } + + // bit offset of given block + bitstream_offset block_offset(size_t block_index) const + { + // if index is being built, point offset to end + if (block_index == block) + return end; + // index has already been built; decode offset + size_t chunk = block_index / 8; + size_t which = block_index % 8; + return offset(data[2 * chunk + 0], data[2 * chunk + 1], static_cast(which)); + } + + // reset index + void clear() + { + block = 0; + ptr = 0; + end = 0; + } + + void resize(size_t blocks) + { + this->blocks = blocks; + zfp::internal::reallocate(data, capacity() * sizeof(*data)); + clear(); + } + + // flush any buffered data + void flush() + { + while (block & 0x7u) + set_block_size(block, 0); + } + + // set bit size of all blocks + void set_block_size(size_t size) + { + clear(); + while (block < blocks) + set_block_size(block, size); + flush(); + clear(); + } + + // set bit size of given block (in sequential order) + void set_block_size(size_t block_index, size_t size) + { + // ensure block_index is next in sequence + if (block_index != block) + throw zfp::exception("zfp index supports only sequential build"); + // ensure block index is within bounds, but allow 0-size blocks for padding + if (block >= blocks && size) + throw zfp::exception("zfp index overflow"); + // ensure block size is valid + if (size >> (hbits + lbits)) + throw zfp::exception("zfp block size is too large for hybrid8 index"); + // advance end pointer + end += size; + // buffer chunk of 8 block sizes at a time + size_t chunk = block / 8; + size_t which = block % 8; + buffer[which] = size; + if (which == 7u) { + // partition chunk offset into low and high bits + uint64 h = ptr >> lbits; + uint64 l = ptr - (h << lbits); + uint64 hi = h << (7 * hbits); + uint64 lo = l << (7 * lbits); + // make sure base offset does not overflow + if ((hi >> (7 * hbits)) != h) + throw zfp::exception("zfp block offset is too large for hybrid8 index"); + // store sizes of blocks 0-6 + for (uint k = 0; k < 7; k++) { + size = buffer[k]; + ptr += size; + // partition block size into hbits high and lbits low bits + h = size >> lbits; + l = size - (h << lbits); + hi += h << ((6 - k) * hbits); + lo += l << ((6 - k) * lbits); + } + ptr += buffer[7]; + data[2 * chunk + 0] = hi; + data[2 * chunk + 1] = lo; + } + block++; + } + + // supports variable rate + static bool has_variable_rate() { return true; } + +protected: + // capacity of data array + size_t capacity() const { return 2 * ((blocks + 7) / 8); } + + // make a deep copy of index + void deep_copy(const hybrid8& index) + { + zfp::internal::clone(data, index.data, index.capacity()); + blocks = index.blocks; + block = index.block; + ptr = index.ptr; + end = index.end; + std::copy(index.buffer, index.buffer + 8, buffer); + } + + // kth size in chunk, 0 <= k <= 6 + static size_t size(uint64 h, uint64 l, uint k) + { + // extract high and low bits + h >>= (6 - k) * hbits; h &= (UINT64C(1) << hbits) - 1; + l >>= (6 - k) * lbits; l &= (UINT64C(1) << lbits) - 1; + // combine base offset with high and low bits + return static_cast((h << lbits) + l); + } + + // kth offset in chunk, 0 <= k <= 7 + static bitstream_offset offset(uint64 h, uint64 l, uint k) + { + // extract all but lowest (8 * hbits) bits + uint64 base = h >> (8 * hbits); + h -= base << (8 * hbits); + // add LSBs of base offset and k block sizes + h = hsum(h >> ((7 - k) * hbits)); + l = lsum(l >> 
((7 - k) * lbits)); + // combine base offset with high and low bits + return static_cast((((base << hbits) + h) << lbits) + l); + } + + // sum of (up to) eight packed 8-bit numbers (efficient version of sum8) + static uint64 lsum(uint64 x) + { + // reduce in parallel + uint64 y = x & UINT64C(0xff00ff00ff00ff00); + x -= y; + x += y >> 8; + x += x >> 16; + x += x >> 32; + return x & UINT64C(0xffff); + } + + // sum of (up to) eight packed h-bit numbers + static uint64 hsum(uint64 x) { return sum8(x, hbits); } + + // compute sum of eight packed n-bit values (1 <= n <= 8) + static uint64 sum8(uint64 x, uint n) + { + // bit masks for extracting terms of sums + uint64 m3 = ~UINT64C(0) << (4 * n); + uint64 m2 = m3 ^ (m3 << (4 * n)); + uint64 m1 = m2 ^ (m2 >> (2 * n)); + uint64 m0 = m1 ^ (m1 >> (1 * n)); + uint64 y; + // perform summations in parallel + y = x & m0; x -= y; x += y >> n; n *= 2; // four summations + y = x & m1; x -= y; x += y >> n; n *= 2; // two summations + y = x & m2; x -= y; x += y >> n; n *= 2; // final summation + return x; + } + + static const uint lbits = 8; // 64 bits partitioned into 8 + static const uint hbits = 2 * (dims - 1); // log2(4^d * maxprec / 2^lbits) + + uint64* data; // block offset array + size_t blocks; // number of blocks + size_t block; // current block index + bitstream_offset end; // offset to last block + bitstream_offset ptr; // offset to current set of blocks + size_t buffer[8]; // sizes of 8 blocks to be stored together +}; + +} // index +} // zfp + +#endif diff --git a/array/zfp/cache.h b/include/zfp/internal/array/cache.hpp similarity index 82% rename from array/zfp/cache.h rename to include/zfp/internal/array/cache.hpp index 3630910a..533c37db 100644 --- a/array/zfp/cache.h +++ b/include/zfp/internal/array/cache.hpp @@ -1,7 +1,7 @@ -#ifndef ZFP_CACHE_H -#define ZFP_CACHE_H +#ifndef ZFP_CACHE_HPP +#define ZFP_CACHE_HPP -#include "memory.h" +#include "zfp/internal/array/memory.hpp" #ifdef ZFP_WITH_CACHE_PROFILE // maintain stats on hit and miss rates @@ -9,6 +9,7 @@ #endif namespace zfp { +namespace internal { // direct-mapped or two-way skew-associative write-back cache template @@ -87,11 +88,11 @@ class Cache { }; // allocate cache with at least minsize lines - Cache(uint minsize = 0) : tag(0), line(0) + Cache(uint minsize = 0) : mask(0), tag(0), line(0) { resize(minsize); #ifdef ZFP_WITH_CACHE_PROFILE - std::cerr << "cache lines=" << mask + 1 << std::endl; + std::cerr << "cache lines=" << size() << std::endl; hit[0][0] = hit[1][0] = miss[0] = back[0] = 0; hit[0][1] = hit[1][1] = miss[1] = back[1] = 0; #endif @@ -106,8 +107,8 @@ class Cache { // destructor ~Cache() { - zfp::deallocate_aligned(tag); - zfp::deallocate_aligned(line); + zfp::internal::deallocate_aligned(tag); + zfp::internal::deallocate_aligned(line); #ifdef ZFP_WITH_CACHE_PROFILE std::cerr << "cache R1=" << hit[0][0] << " R2=" << hit[1][0] << " RM=" << miss[0] << " RB=" << back[0] << " W1=" << hit[0][1] << " W2=" << hit[1][1] << " WM=" << miss[1] << " WB=" << back[1] << std::endl; @@ -122,29 +123,47 @@ class Cache { return *this; } + // byte size of cache data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + if (mask & ZFP_DATA_CACHE) + size += this->size() * (sizeof(*tag) + sizeof(*line)); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + // cache size in number of lines uint size() const { return mask + 1; } // change cache size to at least minsize lines (all contents will be lost) void 
resize(uint minsize) { + // compute smallest value of mask such that mask + 1 = 2^k >= minsize for (mask = minsize ? minsize - 1 : 1; mask & (mask + 1); mask |= mask + 1); - zfp::reallocate_aligned(tag, ((size_t)mask + 1) * sizeof(Tag), 0x100); - zfp::reallocate_aligned(line, ((size_t)mask + 1) * sizeof(Line), 0x100); + zfp::internal::reallocate_aligned(tag, size() * sizeof(Tag), ZFP_MEMORY_ALIGNMENT); + zfp::internal::reallocate_aligned(line, size() * sizeof(Line), ZFP_MEMORY_ALIGNMENT); clear(); } // look up cache line #x and return pointer to it if in the cache; // otherwise return null - const Line* lookup(Index x) const + Line* lookup(Index x, bool write) { uint i = primary(x); - if (tag[i].index() == x) + if (tag[i].index() == x) { + if (write) + tag[i].mark(); return line + i; + } #ifdef ZFP_WITH_CACHE_TWOWAY uint j = secondary(x); - if (tag[j].index() == x) + if (tag[j].index() == x) { + if (write) + tag[i].mark(); return line + j; + } #endif return 0; } @@ -211,8 +230,8 @@ class Cache { void deep_copy(const Cache& c) { mask = c.mask; - zfp::clone_aligned(tag, c.tag, mask + 1, 0x100u); - zfp::clone_aligned(line, c.line, mask + 1, 0x100u); + zfp::internal::clone_aligned(tag, c.tag, size(), ZFP_MEMORY_ALIGNMENT); + zfp::internal::clone_aligned(line, c.line, size(), ZFP_MEMORY_ALIGNMENT); #ifdef ZFP_WITH_CACHE_PROFILE hit[0][0] = c.hit[0][0]; hit[0][1] = c.hit[0][1]; @@ -256,6 +275,7 @@ class Cache { #endif }; -} +} // internal +} // zfp #endif diff --git a/include/zfp/internal/array/cache1.hpp b/include/zfp/internal/array/cache1.hpp new file mode 100644 index 00000000..24f192e5 --- /dev/null +++ b/include/zfp/internal/array/cache1.hpp @@ -0,0 +1,201 @@ +#ifndef ZFP_CACHE1_HPP +#define ZFP_CACHE1_HPP + +#include "zfp/internal/array/cache.hpp" + +namespace zfp { +namespace internal { + +template +class BlockCache1 { +public: + // constructor of cache of given size + BlockCache1(Store& store, size_t bytes = 0) : + cache(lines(bytes, store.blocks())), + store(store) + {} + + // byte size of cache data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // cache size in number of bytes (cache line payload data only) + size_t size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (inferred from blocks if zero) + void resize(size_t bytes) + { + flush(); + cache.resize(lines(bytes, store.blocks())); + } + + // empty cache without compressing modified cached blocks + void clear() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush() const + { + for (typename zfp::internal::Cache::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + size_t block_index = p->tag.index() - 1; + store.encode(block_index, p->line->data()); + } + cache.flush(p->line); + } + } + + // perform a deep copy + void deep_copy(const BlockCache1& c) { cache = c.cache; } + + // inspector + Scalar get(size_t i) const + { + const CacheLine* p = line(i, false); + return (*p)(i); + } + + // mutator + void set(size_t i, Scalar val) + { + CacheLine* p = line(i, true); + (*p)(i) = val; + } + + // reference to cached element + Scalar& ref(size_t i) + { + CacheLine* p = line(i, true); + return (*p)(i); + } + + // read-no-allocate: copy block from cache on hit, else from store without caching + void get_block(size_t block_index, Scalar* p, ptrdiff_t sx) const + { 
+ const CacheLine* line = cache.lookup((uint)block_index + 1, false); + if (line) + line->get(p, sx, store.block_shape(block_index)); + else + store.decode(block_index, p, sx); + } + + // write-no-allocate: copy block to cache on hit, else to store without caching + void put_block(size_t block_index, const Scalar* p, ptrdiff_t sx) + { + CacheLine* line = cache.lookup((uint)block_index + 1, true); + if (line) + line->put(p, sx, store.block_shape(block_index)); + else + store.encode(block_index, p, sx); + } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + // accessors + Scalar operator()(size_t i) const { return a[index(i)]; } + Scalar& operator()(size_t i) { return a[index(i)]; } + + // pointer to decompressed block data + const Scalar* data() const { return a; } + Scalar* data() { return a; } + + // copy whole block from cache line + void get(Scalar* p, ptrdiff_t sx) const + { + const Scalar* q = a; + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + + // copy partial block from cache line + void get(Scalar* p, ptrdiff_t sx, uint shape) const + { + if (!shape) + get(p, sx); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + + // copy whole block to cache line + void put(const Scalar* p, ptrdiff_t sx) + { + Scalar* q = a; + for (uint x = 0; x < 4; x++, p += sx, q++) + *q = *p; + } + + // copy partial block to cache line + void put(const Scalar* p, ptrdiff_t sx, uint shape) + { + if (!shape) + put(p, sx); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + Scalar* q = a; + for (uint x = 0; x < nx; x++, p += sx, q++) + *q = *p; + } + } + + protected: + static size_t index(size_t i) { return (i & 3u); } + Scalar a[4]; + }; + + // return cache line for i; may require write-back and fetch + CacheLine* line(size_t i, bool write) const + { + CacheLine* p = 0; + size_t block_index = store.block_index(i); + typename zfp::internal::Cache::Tag tag = cache.access(p, (uint)block_index + 1, write); + size_t stored_block_index = tag.index() - 1; + if (stored_block_index != block_index) { + // write back occupied cache line if it is dirty + if (tag.dirty()) + store.encode(stored_block_index, p->data()); + // fetch cache line + store.decode(block_index, p->data()); + } + return p; + } + + // default number of cache lines for array with given number of blocks + static uint lines(size_t blocks) + { + // compute m = O(sqrt(n)) + size_t m; + for (m = 1; m * m < blocks; m *= 2); + return static_cast(m); + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t bytes, size_t blocks) + { + // ensure block index fits in tag + if (blocks >> ((sizeof(uint) * CHAR_BIT) - 1)) + throw zfp::exception("zfp array too large for cache"); + uint n = bytes ? 
static_cast((bytes + sizeof(CacheLine) - 1) / sizeof(CacheLine)) : lines(blocks); + return std::max(n, 1u); + } + + mutable Cache cache; // cache of decompressed blocks + Store& store; // store backed by cache +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/cache2.hpp b/include/zfp/internal/array/cache2.hpp new file mode 100644 index 00000000..e7aa07d9 --- /dev/null +++ b/include/zfp/internal/array/cache2.hpp @@ -0,0 +1,207 @@ +#ifndef ZFP_CACHE2_HPP +#define ZFP_CACHE2_HPP + +#include "zfp/internal/array/cache.hpp" + +namespace zfp { +namespace internal { + +template +class BlockCache2 { +public: + // constructor of cache of given size + BlockCache2(Store& store, size_t bytes = 0) : + cache(lines(bytes, store.blocks())), + store(store) + {} + + // byte size of cache data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // cache size in number of bytes (cache line payload data only) + size_t size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (inferred from blocks if zero) + void resize(size_t bytes) + { + flush(); + cache.resize(lines(bytes, store.blocks())); + } + + // empty cache without compressing modified cached blocks + void clear() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush() const + { + for (typename zfp::internal::Cache::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + size_t block_index = p->tag.index() - 1; + store.encode(block_index, p->line->data()); + } + cache.flush(p->line); + } + } + + // perform a deep copy + void deep_copy(const BlockCache2& c) { cache = c.cache; } + + // inspector + Scalar get(size_t i, size_t j) const + { + const CacheLine* p = line(i, j, false); + return (*p)(i, j); + } + + // mutator + void set(size_t i, size_t j, Scalar val) + { + CacheLine* p = line(i, j, true); + (*p)(i, j) = val; + } + + // reference to cached element + Scalar& ref(size_t i, size_t j) + { + CacheLine* p = line(i, j, true); + return (*p)(i, j); + } + + // read-no-allocate: copy block from cache on hit, else from store without caching + void get_block(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const + { + const CacheLine* line = cache.lookup((uint)block_index + 1, false); + if (line) + line->get(p, sx, sy, store.block_shape(block_index)); + else + store.decode(block_index, p, sx, sy); + } + + // write-no-allocate: copy block to cache on hit, else to store without caching + void put_block(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) + { + CacheLine* line = cache.lookup((uint)block_index + 1, true); + if (line) + line->put(p, sx, sy, store.block_shape(block_index)); + else + store.encode(block_index, p, sx, sy); + } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + // accessors + Scalar operator()(size_t i, size_t j) const { return a[index(i, j)]; } + Scalar& operator()(size_t i, size_t j) { return a[index(i, j)]; } + + // pointer to decompressed block data + const Scalar* data() const { return a; } + Scalar* data() { return a; } + + // copy whole block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const + { + const Scalar* q = a; + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + + // 
copy partial block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy, uint shape) const + { + if (!shape) + get(p, sx, sy); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + + // copy whole block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) + { + Scalar* q = a; + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *q = *p; + } + + // copy partial block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, uint shape) + { + if (!shape) + put(p, sx, sy); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + Scalar* q = a; + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *q = *p; + } + } + + protected: + static size_t index(size_t i, size_t j) { return (i & 3u) + 4 * (j & 3u); } + Scalar a[4 * 4]; + }; + + // return cache line for (i, j); may require write-back and fetch + CacheLine* line(size_t i, size_t j, bool write) const + { + CacheLine* p = 0; + size_t block_index = store.block_index(i, j); + typename zfp::internal::Cache::Tag tag = cache.access(p, (uint)block_index + 1, write); + size_t stored_block_index = tag.index() - 1; + if (stored_block_index != block_index) { + // write back occupied cache line if it is dirty + if (tag.dirty()) + store.encode(stored_block_index, p->data()); + // fetch cache line + store.decode(block_index, p->data()); + } + return p; + } + + // default number of cache lines for array with given number of blocks + static uint lines(size_t blocks) + { + // compute m = O(sqrt(n)) + size_t m; + for (m = 1; m * m < blocks; m *= 2); + return static_cast(m); + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t bytes, size_t blocks) + { + // ensure block index fits in tag + if (blocks >> ((sizeof(uint) * CHAR_BIT) - 1)) + throw zfp::exception("zfp array too large for cache"); + uint n = bytes ? 
static_cast((bytes + sizeof(CacheLine) - 1) / sizeof(CacheLine)) : lines(blocks); + return std::max(n, 1u); + } + + mutable Cache cache; // cache of decompressed blocks + Store& store; // store backed by cache +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/cache3.hpp b/include/zfp/internal/array/cache3.hpp new file mode 100644 index 00000000..1c4c9554 --- /dev/null +++ b/include/zfp/internal/array/cache3.hpp @@ -0,0 +1,213 @@ +#ifndef ZFP_CACHE3_HPP +#define ZFP_CACHE3_HPP + +#include "zfp/internal/array/cache.hpp" + +namespace zfp { +namespace internal { + +template +class BlockCache3 { +public: + // constructor of cache of given size + BlockCache3(Store& store, size_t bytes = 0) : + cache(lines(bytes, store.blocks())), + store(store) + {} + + // byte size of cache data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // cache size in number of bytes (cache line payload data only) + size_t size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (inferred from blocks if zero) + void resize(size_t bytes) + { + flush(); + cache.resize(lines(bytes, store.blocks())); + } + + // empty cache without compressing modified cached blocks + void clear() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush() const + { + for (typename zfp::internal::Cache::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + size_t block_index = p->tag.index() - 1; + store.encode(block_index, p->line->data()); + } + cache.flush(p->line); + } + } + + // perform a deep copy + void deep_copy(const BlockCache3& c) { cache = c.cache; } + + // inspector + Scalar get(size_t i, size_t j, size_t k) const + { + const CacheLine* p = line(i, j, k, false); + return (*p)(i, j, k); + } + + // mutator + void set(size_t i, size_t j, size_t k, Scalar val) + { + CacheLine* p = line(i, j, k, true); + (*p)(i, j, k) = val; + } + + // reference to cached element + Scalar& ref(size_t i, size_t j, size_t k) + { + CacheLine* p = line(i, j, k, true); + return (*p)(i, j, k); + } + + // read-no-allocate: copy block from cache on hit, else from store without caching + void get_block(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + const CacheLine* line = cache.lookup((uint)block_index + 1, false); + if (line) + line->get(p, sx, sy, sz, store.block_shape(block_index)); + else + store.decode(block_index, p, sx, sy, sz); + } + + // write-no-allocate: copy block to cache on hit, else to store without caching + void put_block(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + CacheLine* line = cache.lookup((uint)block_index + 1, true); + if (line) + line->put(p, sx, sy, sz, store.block_shape(block_index)); + else + store.encode(block_index, p, sx, sy, sz); + } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + // accessors + Scalar operator()(size_t i, size_t j, size_t k) const { return a[index(i, j, k)]; } + Scalar& operator()(size_t i, size_t j, size_t k) { return a[index(i, j, k)]; } + + // pointer to decompressed block data + const Scalar* data() const { return a; } + Scalar* data() { return a; } + + // copy whole block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + 
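+      // note on the loop increments below: the destination pointer p advances by
+      // sx for each element copied; once a row of 4 values is done, the y-loop
+      // increment of sy - 4 * sx undoes those four x-steps and moves p one row
+      // forward, and the z-loop increment of sz - 4 * sy does the same for each
+      // completed plane, so each outer iteration advances p by exactly one stride
+      // in its own dimension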
const Scalar* q = a; + for (uint z = 0; z < 4; z++, p += sz - 4 * sy) + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + + // copy partial block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, uint shape) const + { + if (!shape) + get(p, sx, sy, sz); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + + // copy whole block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) + { + Scalar* q = a; + for (uint z = 0; z < 4; z++, p += sz - 4 * sy) + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *q = *p; + } + + // copy partial block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, uint shape) + { + if (!shape) + put(p, sx, sy, sz); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + Scalar* q = a; + for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *q = *p; + } + } + + protected: + static size_t index(size_t i, size_t j, size_t k) { return (i & 3u) + 4 * ((j & 3u) + 4 * (k & 3u)); } + Scalar a[4 * 4 * 4]; + }; + + // return cache line for (i, j, k); may require write-back and fetch + CacheLine* line(size_t i, size_t j, size_t k, bool write) const + { + CacheLine* p = 0; + size_t block_index = store.block_index(i, j, k); + typename zfp::internal::Cache::Tag tag = cache.access(p, (uint)block_index + 1, write); + size_t stored_block_index = tag.index() - 1; + if (stored_block_index != block_index) { + // write back occupied cache line if it is dirty + if (tag.dirty()) + store.encode(stored_block_index, p->data()); + // fetch cache line + store.decode(block_index, p->data()); + } + return p; + } + + // default number of cache lines for array with given number of blocks + static uint lines(size_t blocks) + { + // compute m = O(sqrt(n)) + size_t m; + for (m = 1; m * m < blocks; m *= 2); + return static_cast(m); + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t bytes, size_t blocks) + { + // ensure block index fits in tag + if (blocks >> ((sizeof(uint) * CHAR_BIT) - 1)) + throw zfp::exception("zfp array too large for cache"); + uint n = bytes ? 
static_cast((bytes + sizeof(CacheLine) - 1) / sizeof(CacheLine)) : lines(blocks); + return std::max(n, 1u); + } + + mutable Cache cache; // cache of decompressed blocks + Store& store; // store backed by cache +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/cache4.hpp b/include/zfp/internal/array/cache4.hpp new file mode 100644 index 00000000..69182b7e --- /dev/null +++ b/include/zfp/internal/array/cache4.hpp @@ -0,0 +1,219 @@ +#ifndef ZFP_CACHE4_HPP +#define ZFP_CACHE4_HPP + +#include "zfp/internal/array/cache.hpp" + +namespace zfp { +namespace internal { + +template +class BlockCache4 { +public: + // constructor of cache of given size + BlockCache4(Store& store, size_t bytes = 0) : + cache(lines(bytes, store.blocks())), + store(store) + {} + + // byte size of cache data structure components indicated by mask + size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += cache.size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // cache size in number of bytes (cache line payload data only) + size_t size() const { return cache.size() * sizeof(CacheLine); } + + // set minimum cache size in bytes (inferred from blocks if zero) + void resize(size_t bytes) + { + flush(); + cache.resize(lines(bytes, store.blocks())); + } + + // empty cache without compressing modified cached blocks + void clear() const { cache.clear(); } + + // flush cache by compressing all modified cached blocks + void flush() const + { + for (typename zfp::internal::Cache::const_iterator p = cache.first(); p; p++) { + if (p->tag.dirty()) { + size_t block_index = p->tag.index() - 1; + store.encode(block_index, p->line->data()); + } + cache.flush(p->line); + } + } + + // perform a deep copy + void deep_copy(const BlockCache4& c) { cache = c.cache; } + + // inspector + Scalar get(size_t i, size_t j, size_t k, size_t l) const + { + const CacheLine* p = line(i, j, k, l, false); + return (*p)(i, j, k, l); + } + + // mutator + void set(size_t i, size_t j, size_t k, size_t l, Scalar val) + { + CacheLine* p = line(i, j, k, l, true); + (*p)(i, j, k, l) = val; + } + + // reference to cached element + Scalar& ref(size_t i, size_t j, size_t k, size_t l) + { + CacheLine* p = line(i, j, k, l, true); + return (*p)(i, j, k, l); + } + + // read-no-allocate: copy block from cache on hit, else from store without caching + void get_block(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + const CacheLine* line = cache.lookup((uint)block_index + 1, false); + if (line) + line->get(p, sx, sy, sz, sw, store.block_shape(block_index)); + else + store.decode(block_index, p, sx, sy, sz, sw); + } + + // write-no-allocate: copy block to cache on hit, else to store without caching + void put_block(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + CacheLine* line = cache.lookup((uint)block_index + 1, true); + if (line) + line->put(p, sx, sy, sz, sw, store.block_shape(block_index)); + else + store.encode(block_index, p, sx, sy, sz, sw); + } + +protected: + // cache line representing one block of decompressed values + class CacheLine { + public: + // accessors + Scalar operator()(size_t i, size_t j, size_t k, size_t l) const { return a[index(i, j, k, l)]; } + Scalar& operator()(size_t i, size_t j, size_t k, size_t l) { return a[index(i, j, k, l)]; } + + // pointer to decompressed block data + const Scalar* data() const { return a; } + Scalar* data() { return a; } 
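+
+    // note: the get/put overloads below that take a shape argument handle blocks
+    // overlapping the array boundary; shape packs four 2-bit fields (x first,
+    // then y, z, w), each field s giving a block extent of 4 - s in that
+    // dimension, with shape == 0 denoting a full 4x4x4x4 block, while the q
+    // adjustments (4 - nx, 16 - 4 * ny, 64 - 16 * nz) skip the unused entries
+    // of the fully populated cache line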
+ + // copy whole block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + const Scalar* q = a; + for (uint w = 0; w < 4; w++, p += sw - 4 * sz) + for (uint z = 0; z < 4; z++, p += sz - 4 * sy) + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *p = *q; + } + + // copy partial block from cache line + void get(Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw, uint shape) const + { + if (!shape) + get(p, sx, sy, sz, sw); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + uint nw = 4 - (shape & 3u); shape >>= 2; + const Scalar* q = a; + for (uint w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz) + for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *p = *q; + } + } + + // copy whole block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) + { + Scalar* q = a; + for (uint w = 0; w < 4; w++, p += sw - 4 * sz) + for (uint z = 0; z < 4; z++, p += sz - 4 * sy) + for (uint y = 0; y < 4; y++, p += sy - 4 * sx) + for (uint x = 0; x < 4; x++, p += sx, q++) + *q = *p; + } + + // copy partial block to cache line + void put(const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw, uint shape) + { + if (!shape) + put(p, sx, sy, sz, sw); + else { + // determine block dimensions + uint nx = 4 - (shape & 3u); shape >>= 2; + uint ny = 4 - (shape & 3u); shape >>= 2; + uint nz = 4 - (shape & 3u); shape >>= 2; + uint nw = 4 - (shape & 3u); shape >>= 2; + Scalar* q = a; + for (uint w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 64 - 16 * nz) + for (uint z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 16 - 4 * ny) + for (uint y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) + for (uint x = 0; x < nx; x++, p += sx, q++) + *q = *p; + } + } + + protected: + static size_t index(size_t i, size_t j, size_t k, size_t l) { return (i & 3u) + 4 * ((j & 3u) + 4 * ((k & 3u) + 4 * (l & 3u))); } + Scalar a[4 * 4 * 4 * 4]; + }; + + // return cache line for (i, j, k, l); may require write-back and fetch + CacheLine* line(size_t i, size_t j, size_t k, size_t l, bool write) const + { + CacheLine* p = 0; + size_t block_index = store.block_index(i, j, k, l); + typename zfp::internal::Cache::Tag tag = cache.access(p, (uint)block_index + 1, write); + size_t stored_block_index = tag.index() - 1; + if (stored_block_index != block_index) { + // write back occupied cache line if it is dirty + if (tag.dirty()) + store.encode(stored_block_index, p->data()); + // fetch cache line + store.decode(block_index, p->data()); + } + return p; + } + + // default number of cache lines for array with given number of blocks + static uint lines(size_t blocks) + { + // compute m = O(sqrt(n)) + size_t m; + for (m = 1; m * m < blocks; m *= 2); + return static_cast(m); + } + + // number of cache lines corresponding to size (or suggested size if zero) + static uint lines(size_t bytes, size_t blocks) + { + // ensure block index fits in tag + if (blocks >> ((sizeof(uint) * CHAR_BIT) - 1)) + throw zfp::exception("zfp array too large for cache"); + uint n = bytes ? 
static_cast((bytes + sizeof(CacheLine) - 1) / sizeof(CacheLine)) : lines(blocks); + return std::max(n, 1u); + } + + mutable Cache cache; // cache of decompressed blocks + Store& store; // store backed by cache +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/exception.hpp b/include/zfp/internal/array/exception.hpp new file mode 100644 index 00000000..747bf6bd --- /dev/null +++ b/include/zfp/internal/array/exception.hpp @@ -0,0 +1,18 @@ +#ifndef ZFP_EXCEPTION_HPP +#define ZFP_EXCEPTION_HPP + +#include +#include + +namespace zfp { + +// generic exception thrown by array constructors +class exception : public std::runtime_error { +public: + exception(const std::string& msg) : runtime_error(msg) {} + virtual ~exception() throw() {} +}; + +} + +#endif diff --git a/include/zfp/internal/array/handle1.hpp b/include/zfp/internal/array/handle1.hpp new file mode 100644 index 00000000..72f5e91b --- /dev/null +++ b/include/zfp/internal/array/handle1.hpp @@ -0,0 +1,38 @@ +#ifndef ZFP_HANDLE1_HPP +#define ZFP_HANDLE1_HPP + +namespace zfp { +namespace internal { +namespace dim1 { + +// forward declarations +template class const_reference; +template class const_pointer; +template class const_iterator; +template class reference; +template class pointer; +template class iterator; + +// const handle to a 1D array or view element +template +class const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + +protected: + // protected constructor + explicit const_handle(const container_type* container, size_t x) : container(const_cast(container)), x(x) {} + + // dereference handle + value_type get() const { return container->get(x); } + + container_type* container; // container + size_t x; // global element index +}; + +} // dim1 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/handle2.hpp b/include/zfp/internal/array/handle2.hpp new file mode 100644 index 00000000..17b5043e --- /dev/null +++ b/include/zfp/internal/array/handle2.hpp @@ -0,0 +1,38 @@ +#ifndef ZFP_HANDLE2_HPP +#define ZFP_HANDLE2_HPP + +namespace zfp { +namespace internal { +namespace dim2 { + +// forward declarations +template class const_reference; +template class const_pointer; +template class const_iterator; +template class reference; +template class pointer; +template class iterator; + +// const handle to a 2D array or view element +template +class const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + +protected: + // protected constructor + explicit const_handle(const container_type* container, size_t x, size_t y) : container(const_cast(container)), x(x), y(y) {} + + // dereference handle + value_type get() const { return container->get(x, y); } + + container_type* container; // container + size_t x, y; // global element index +}; + +} // dim2 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/handle3.hpp b/include/zfp/internal/array/handle3.hpp new file mode 100644 index 00000000..139b1d55 --- /dev/null +++ b/include/zfp/internal/array/handle3.hpp @@ -0,0 +1,38 @@ +#ifndef ZFP_HANDLE3_HPP +#define ZFP_HANDLE3_HPP + +namespace zfp { +namespace internal { +namespace dim3 { + +// forward declarations +template class const_reference; +template class const_pointer; +template class const_iterator; +template class reference; +template class pointer; +template class iterator; + +// const handle to a 3D array or view element +template +class 
const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + +protected: + // protected constructor + explicit const_handle(const container_type* container, size_t x, size_t y, size_t z) : container(const_cast(container)), x(x), y(y), z(z) {} + + // dereference handle + value_type get() const { return container->get(x, y, z); } + + container_type* container; // container + size_t x, y, z; // global element index +}; + +} // dim3 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/handle4.hpp b/include/zfp/internal/array/handle4.hpp new file mode 100644 index 00000000..da9ca385 --- /dev/null +++ b/include/zfp/internal/array/handle4.hpp @@ -0,0 +1,38 @@ +#ifndef ZFP_HANDLE4_HPP +#define ZFP_HANDLE4_HPP + +namespace zfp { +namespace internal { +namespace dim4 { + +// forward declarations +template class const_reference; +template class const_pointer; +template class const_iterator; +template class reference; +template class pointer; +template class iterator; + +// const handle to a 4D array or view element +template +class const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + +protected: + // protected constructor + explicit const_handle(const container_type* container, size_t x, size_t y, size_t z, size_t w) : container(const_cast(container)), x(x), y(y), z(z), w(w) {} + + // dereference handle + value_type get() const { return container->get(x, y, z, w); } + + container_type* container; // container + size_t x, y, z, w; // global element index +}; + +} // dim4 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/header.hpp b/include/zfp/internal/array/header.hpp new file mode 100644 index 00000000..7d9146de --- /dev/null +++ b/include/zfp/internal/array/header.hpp @@ -0,0 +1,41 @@ +// abstract base class for array header +class header { +public: + // default constructor + header() : + type(zfp_type_none), + nx(0), ny(0), nz(0), nw(0) + {} + + // constructor + header(const zfp::array& a) : + type(a.type), + nx(a.nx), ny(a.ny), nz(a.nz), nw(a.nw) + {} + + // destructor + virtual ~header() {} + + // array scalar type + zfp_type scalar_type() const { return type; } + + // array dimensionality + uint dimensionality() const { return nw ? 4 : nz ? 3 : ny ? 2 : nx ? 
1 : 0; } + + // array dimensions + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // rate in bits per value + virtual double rate() const = 0; + + // header payload: data pointer and byte size + virtual const void* data() const = 0; + virtual size_t size_bytes(uint mask = ZFP_DATA_HEADER) const = 0; + +protected: + zfp_type type; // array scalar type + size_t nx, ny, nz, nw; // array dimensions +}; diff --git a/include/zfp/internal/array/iterator1.hpp b/include/zfp/internal/array/iterator1.hpp new file mode 100644 index 00000000..73d5197d --- /dev/null +++ b/include/zfp/internal/array/iterator1.hpp @@ -0,0 +1,137 @@ +#ifndef ZFP_ITERATOR1_HPP +#define ZFP_ITERATOR1_HPP + +namespace zfp { +namespace internal { +namespace dim1 { + +// random access const iterator that visits 1D array or view block by block +template +class const_iterator : public const_handle { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim1::reference reference; + typedef zfp::internal::dim1::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + typedef zfp::internal::dim1::const_reference const_reference; + typedef zfp::internal::dim1::const_pointer const_pointer; + + // default constructor + const_iterator() : const_handle(0, 0) {} + + // constructor + explicit const_iterator(const container_type* container, size_t x) : const_handle(container, x) {} + + // dereference iterator + const_reference operator*() const { return const_reference(container, x); } + const_reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + const_iterator operator+(difference_type d) const { const_iterator it = *this; it.advance(d); return it; } + const_iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const const_iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const const_iterator& it) const { return container == it.container && x == it.x; } + bool operator!=(const const_iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const const_iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const const_iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const const_iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const const_iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + const_iterator& operator++() { increment(); return *this; } + const_iterator& operator--() { decrement(); return *this; } + const_iterator operator++(int) { const_iterator it = *this; increment(); return it; } + const_iterator operator--(int) { const_iterator it = *this; decrement(); return it; } + const_iterator operator+=(difference_type d) { advance(+d); return *this; } + const_iterator operator-=(difference_type d) { advance(-d); return *this; } + + // local container index of value referenced by iterator + size_t i() const { return x - container->min_x(); } + +protected: + // sequential offset associated with index x plus delta d + difference_type offset(difference_type d = 0) const { 
return static_cast(x - container->min_x() + size_t(d)); } + + // index x associated with sequential offset p + void index(size_t& x, difference_type p) const { x = container->min_x() + size_t(p); } + + // advance iterator by d + void advance(difference_type d) { index(x, offset(d)); } + + // increment iterator to next element + void increment() { ++x; } + + // decrement iterator to previous element + void decrement() { --x; } + + using const_handle::container; + using const_handle::x; +}; + +// random access iterator that visits 1D array or view block by block +template +class iterator : public const_iterator { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim1::reference reference; + typedef zfp::internal::dim1::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + // default constructor + iterator() : const_iterator(0, 0) {} + + // constructor + explicit iterator(container_type* container, size_t i) : const_iterator(container, i) {} + + // dereference iterator + reference operator*() const { return reference(container, x); } + reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + iterator operator+(difference_type d) const { iterator it = *this; it.advance(d); return it; } + iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const iterator& it) const { return container == it.container && x == it.x; } + bool operator!=(const iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { advance(+d); return *this; } + iterator operator-=(difference_type d) { advance(-d); return *this; } + +protected: + using const_iterator::offset; + using const_iterator::advance; + using const_iterator::increment; + using const_iterator::decrement; + using const_iterator::container; + using const_iterator::x; +}; + +} // dim1 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/iterator2.hpp b/include/zfp/internal/array/iterator2.hpp new file mode 100644 index 00000000..433d1825 --- /dev/null +++ b/include/zfp/internal/array/iterator2.hpp @@ -0,0 +1,230 @@ +#ifndef ZFP_ITERATOR2_HPP +#define ZFP_ITERATOR2_HPP + +namespace zfp { +namespace internal { +namespace dim2 { + +// random access const iterator that visits 2D array or view block by block +template +class const_iterator : public const_handle { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + 
typedef ptrdiff_t difference_type; + typedef zfp::internal::dim2::reference reference; + typedef zfp::internal::dim2::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + typedef zfp::internal::dim2::const_reference const_reference; + typedef zfp::internal::dim2::const_pointer const_pointer; + + // default constructor + const_iterator() : const_handle(0, 0, 0) {} + + // constructor + explicit const_iterator(const container_type* container, size_t x, size_t y) : const_handle(container, x, y) {} + + // dereference iterator + const_reference operator*() const { return const_reference(container, x, y); } + const_reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + const_iterator operator+(difference_type d) const { const_iterator it = *this; it.advance(d); return it; } + const_iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const const_iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const const_iterator& it) const { return container == it.container && x == it.x && y == it.y; } + bool operator!=(const const_iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const const_iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const const_iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const const_iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const const_iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + const_iterator& operator++() { increment(); return *this; } + const_iterator& operator--() { decrement(); return *this; } + const_iterator operator++(int) { const_iterator it = *this; increment(); return it; } + const_iterator operator--(int) { const_iterator it = *this; decrement(); return it; } + const_iterator operator+=(difference_type d) { advance(+d); return *this; } + const_iterator operator-=(difference_type d) { advance(-d); return *this; } + + // local container index of value referenced by iterator + size_t i() const { return x - container->min_x(); } + size_t j() const { return y - container->min_y(); } + +protected: + // sequential offset associated with index (x, y) plus delta d + difference_type offset(difference_type d = 0) const + { + difference_type p = d; + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + if (y == ymax) + p += nx * ny; + else { + size_t m = ~size_t(3); + size_t by = std::max(y & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy; + p += (y - by) * sx; + p += (x - bx); + } + return p; + } + + // index (x, y) associated with sequential offset p + void index(size_t& x, size_t& y, difference_type p) const + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + if (size_t(p) == nx * ny) { + x = xmin; + y = ymax; + } + else { + size_t m = ~size_t(3); + size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx))) & 
m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx; + size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy; + y = by + size_t(p / ptrdiff_t(sx)); p -= (y - by) * sx; + x = bx + size_t(p); p -= (x - bx); + } + } + + // advance iterator by d + void advance(difference_type d) { index(x, y, offset(d)); } + + // increment iterator to next element + void increment() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t m = ~size_t(3); + ++x; + if (!(x & 3u) || x == xmax) { + x = std::max((x - 1) & m, xmin); + ++y; + if (!(y & 3u) || y == ymax) { + y = std::max((y - 1) & m, ymin); + // done with block; advance to next + x = (x + 4) & m; + if (x >= xmax) { + x = xmin; + y = (y + 4) & m; + if (y >= ymax) + y = ymax; + } + } + } + } + + // decrement iterator to previous element + void decrement() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t m = ~size_t(3); + if (y == ymax) { + x = xmax - 1; + y = ymax - 1; + } + else { + if (!(x & 3u) || x == xmin) { + x = std::min((x + 4) & m, xmax); + if (!(y & 3u) || y == ymin) { + y = std::min((y + 4) & m, ymax); + // done with block; advance to next + x = (x - 1) & m; + if (x <= xmin) { + x = xmax; + y = (y - 1) & m; + if (y <= ymin) + y = ymin; + } + } + --y; + } + --x; + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; +}; + +// random access iterator that visits 2D array or view block by block +template +class iterator : public const_iterator { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim2::reference reference; + typedef zfp::internal::dim2::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + // default constructor + iterator() : const_iterator(0, 0, 0) {} + + // constructor + explicit iterator(container_type* container, size_t x, size_t y) : const_iterator(container, x, y) {} + + // dereference iterator + reference operator*() const { return reference(container, x, y); } + reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + iterator operator+(difference_type d) const { iterator it = *this; it.advance(d); return it; } + iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const iterator& it) const { return container == it.container && x == it.x && y == it.y; } + bool operator!=(const iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return 
*this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { advance(+d); return *this; } + iterator operator-=(difference_type d) { advance(-d); return *this; } + +protected: + using const_iterator::offset; + using const_iterator::advance; + using const_iterator::increment; + using const_iterator::decrement; + using const_iterator::container; + using const_iterator::x; + using const_iterator::y; +}; + +} // dim2 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/iterator3.hpp b/include/zfp/internal/array/iterator3.hpp new file mode 100644 index 00000000..aa46b5ff --- /dev/null +++ b/include/zfp/internal/array/iterator3.hpp @@ -0,0 +1,265 @@ +#ifndef ZFP_ITERATOR3_HPP +#define ZFP_ITERATOR3_HPP + +namespace zfp { +namespace internal { +namespace dim3 { + +// random access const iterator that visits 3D array or view block by block +template +class const_iterator : public const_handle { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim3::reference reference; + typedef zfp::internal::dim3::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + typedef zfp::internal::dim3::const_reference const_reference; + typedef zfp::internal::dim3::const_pointer const_pointer; + + // default constructor + const_iterator() : const_handle(0, 0, 0, 0) {} + + // constructor + explicit const_iterator(const container_type* container, size_t x, size_t y, size_t z) : const_handle(container, x, y, z) {} + + // dereference iterator + const_reference operator*() const { return const_reference(container, x, y, z); } + const_reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + const_iterator operator+(difference_type d) const { const_iterator it = *this; it.advance(d); return it; } + const_iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const const_iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const const_iterator& it) const { return container == it.container && x == it.x && y == it.y && z == it.z; } + bool operator!=(const const_iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const const_iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const const_iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const const_iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const const_iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + const_iterator& operator++() { increment(); return *this; } + const_iterator& operator--() { decrement(); return *this; } + const_iterator operator++(int) { const_iterator it = *this; increment(); return it; } + const_iterator operator--(int) { const_iterator it = *this; decrement(); return it; } + const_iterator operator+=(difference_type d) { advance(+d); return *this; } + const_iterator operator-=(difference_type d) { advance(-d); return *this; } + + // local container index of value referenced by iterator + size_t i() const { return x - 
container->min_x(); } + size_t j() const { return y - container->min_y(); } + size_t k() const { return z - container->min_z(); } + +protected: + // sequential offset associated with index (x, y, z) plus delta d + difference_type offset(difference_type d = 0) const + { + difference_type p = d; + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + if (z == zmax) + p += nx * ny * nz; + else { + size_t m = ~size_t(3); + size_t bz = std::max(z & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p += (bz - zmin) * nx * ny; + size_t by = std::max(y & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx * sz; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy * sz; + p += (z - bz) * sx * sy; + p += (y - by) * sx; + p += (x - bx); + } + return p; + } + + // index (x, y, z) associated with sequential offset p + void index(size_t& x, size_t& y, size_t& z, difference_type p) const + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + if (size_t(p) == nx * ny * nz) { + x = xmin; + y = ymin; + z = zmax; + } + else { + size_t m = ~size_t(3); + size_t bz = std::max((zmin + size_t(p / ptrdiff_t(nx * ny))) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny; + size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx * sz))) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz; + size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy * sz))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz; + z = bz + size_t(p / ptrdiff_t(sx * sy)); p -= (z - bz) * sx * sy; + y = by + size_t(p / ptrdiff_t(sx)); p -= (y - by) * sx; + x = bx + size_t(p); p -= (x - bx); + } + } + + // advance iterator by d + void advance(difference_type d) { index(x, y, z, offset(d)); } + + // increment iterator to next element + void increment() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t m = ~size_t(3); + ++x; + if (!(x & 3u) || x == xmax) { + x = std::max((x - 1) & m, xmin); + ++y; + if (!(y & 3u) || y == ymax) { + y = std::max((y - 1) & m, ymin); + ++z; + if (!(z & 3u) || z == zmax) { + z = std::max((z - 1) & m, zmin); + // done with block; advance to next + x = (x + 4) & m; + if (x >= xmax) { + x = xmin; + y = (y + 4) & m; + if (y >= ymax) { + y = ymin; + z = (z + 4) & m; + if (z >= zmax) + z = zmax; + } + } + } + } + } + } + + // decrement iterator to previous element + void decrement() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t m = ~size_t(3); + if (z == zmax) { + x = xmax - 1; + y = ymax - 1; + z = zmax - 1; + } + else { + if (!(x & 3u) || x == xmin) { + x = std::min((x + 4) & m, xmax); + if (!(y & 3u) || 
y == ymin) { + y = std::min((y + 4) & m, ymax); + if (!(z & 3u) || z == zmin) { + z = std::min((z + 4) & m, zmax); + // done with block; advance to next + x = (x - 1) & m; + if (x <= xmin) { + x = xmax; + y = (y - 1) & m; + if (y <= ymin) { + y = ymax; + z = (z - 1) & m; + if (z <= zmin) + z = zmin; + } + } + } + --z; + } + --y; + } + --x; + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; +}; + +// random access iterator that visits 3D array or view block by block +template +class iterator : public const_iterator { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim3::reference reference; + typedef zfp::internal::dim3::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + // default constructor + iterator() : const_iterator(0, 0, 0, 0) {} + + // constructor + explicit iterator(container_type* container, size_t x, size_t y, size_t z) : const_iterator(container, x, y, z) {} + + // dereference iterator + reference operator*() const { return reference(container, x, y, z); } + reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + iterator operator+(difference_type d) const { iterator it = *this; it.advance(d); return it; } + iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const iterator& it) const { return container == it.container && x == it.x && y == it.y && z == it.z; } + bool operator!=(const iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { advance(+d); return *this; } + iterator operator-=(difference_type d) { advance(-d); return *this; } + +protected: + using const_iterator::offset; + using const_iterator::advance; + using const_iterator::increment; + using const_iterator::decrement; + using const_iterator::container; + using const_iterator::x; + using const_iterator::y; + using const_iterator::z; +}; + +} // dim3 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/iterator4.hpp b/include/zfp/internal/array/iterator4.hpp new file mode 100644 index 00000000..00b941a5 --- /dev/null +++ b/include/zfp/internal/array/iterator4.hpp @@ -0,0 +1,300 @@ +#ifndef ZFP_ITERATOR4_HPP +#define ZFP_ITERATOR4_HPP + +namespace zfp { +namespace internal { +namespace dim4 { + +// random access const iterator that visits 4D array or view block by block +template +class const_iterator : public const_handle { +public: + // typedefs for STL compatibility + typedef 
Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim4::reference reference; + typedef zfp::internal::dim4::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + typedef zfp::internal::dim4::const_reference const_reference; + typedef zfp::internal::dim4::const_pointer const_pointer; + + // default constructor + const_iterator() : const_handle(0, 0, 0, 0, 0) {} + + // constructor + explicit const_iterator(const container_type* container, size_t x, size_t y, size_t z, size_t w) : const_handle(container, x, y, z, w) {} + + // dereference iterator + const_reference operator*() const { return const_reference(container, x, y, z, w); } + const_reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + const_iterator operator+(difference_type d) const { const_iterator it = *this; it.advance(d); return it; } + const_iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const const_iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const const_iterator& it) const { return container == it.container && x == it.x && y == it.y && z == it.z && w == it.w; } + bool operator!=(const const_iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const const_iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const const_iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const const_iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const const_iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + const_iterator& operator++() { increment(); return *this; } + const_iterator& operator--() { decrement(); return *this; } + const_iterator operator++(int) { const_iterator it = *this; increment(); return it; } + const_iterator operator--(int) { const_iterator it = *this; decrement(); return it; } + const_iterator operator+=(difference_type d) { advance(+d); return *this; } + const_iterator operator-=(difference_type d) { advance(-d); return *this; } + + // local container index of value referenced by iterator + size_t i() const { return x - container->min_x(); } + size_t j() const { return y - container->min_y(); } + size_t k() const { return z - container->min_z(); } + size_t l() const { return w - container->min_w(); } + +protected: + // sequential offset associated with index (x, y, z, w) plus delta d + difference_type offset(difference_type d = 0) const + { + difference_type p = d; + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t wmin = container->min_w(); + size_t wmax = container->max_w(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t nw = wmax - wmin; + if (w == wmax) + p += nx * ny * nz * nw; + else { + size_t m = ~size_t(3); + size_t bw = std::max(w & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p += (bw - wmin) * nx * ny * nz; + size_t bz = std::max(z & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p += (bz - zmin) * nx * ny * sw; + size_t by = std::max(y & m, ymin); 
size_t sy = std::min((by + 4) & m, ymax) - by; p += (by - ymin) * nx * sz * sw; + size_t bx = std::max(x & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p += (bx - xmin) * sy * sz * sw; + p += (w - bw) * sx * sy * sz; + p += (z - bz) * sx * sy; + p += (y - by) * sx; + p += (x - bx); + } + return p; + } + + // index (x, y, z, w) associated with sequential offset p + void index(size_t& x, size_t& y, size_t& z, size_t& w, difference_type p) const + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t wmin = container->min_w(); + size_t wmax = container->max_w(); + size_t nx = xmax - xmin; + size_t ny = ymax - ymin; + size_t nz = zmax - zmin; + size_t nw = wmax - wmin; + if (size_t(p) == nx * ny * nz * nw) { + x = xmin; + y = ymin; + z = zmin; + w = wmax; + } + else { + size_t m = ~size_t(3); + size_t bw = std::max((wmin + size_t(p / ptrdiff_t(nx * ny * nz))) & m, wmin); size_t sw = std::min((bw + 4) & m, wmax) - bw; p -= (bw - wmin) * nx * ny * nz; + size_t bz = std::max((zmin + size_t(p / ptrdiff_t(nx * ny * sw))) & m, zmin); size_t sz = std::min((bz + 4) & m, zmax) - bz; p -= (bz - zmin) * nx * ny * sw; + size_t by = std::max((ymin + size_t(p / ptrdiff_t(nx * sz * sw))) & m, ymin); size_t sy = std::min((by + 4) & m, ymax) - by; p -= (by - ymin) * nx * sz * sw; + size_t bx = std::max((xmin + size_t(p / ptrdiff_t(sy * sz * sw))) & m, xmin); size_t sx = std::min((bx + 4) & m, xmax) - bx; p -= (bx - xmin) * sy * sz * sw; + w = bw + size_t(p / ptrdiff_t(sx * sy * sz)); p -= (w - bw) * sx * sy * sz; + z = bz + size_t(p / ptrdiff_t(sx * sy)); p -= (z - bz) * sx * sy; + y = by + size_t(p / ptrdiff_t(sx)); p -= (y - by) * sx; + x = bx + size_t(p); p -= (x - bx); + } + } + + // advance iterator by d + void advance(difference_type d) { index(x, y, z, w, offset(d)); } + + // increment iterator to next element + void increment() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t wmin = container->min_w(); + size_t wmax = container->max_w(); + size_t m = ~size_t(3); + ++x; + if (!(x & 3u) || x == xmax) { + x = std::max((x - 1) & m, xmin); + ++y; + if (!(y & 3u) || y == ymax) { + y = std::max((y - 1) & m, ymin); + ++z; + if (!(z & 3u) || z == zmax) { + z = std::max((z - 1) & m, zmin); + ++w; + if (!(w & 3u) || w == wmax) { + w = std::max((w - 1) & m, wmin); + // done with block; advance to next + x = (x + 4) & m; + if (x >= xmax) { + x = xmin; + y = (y + 4) & m; + if (y >= ymax) { + y = ymin; + z = (z + 4) & m; + if (z >= zmax) { + z = zmin; + w = (w + 4) & m; + if (w >= wmax) + w = wmax; + } + } + } + } + } + } + } + } + + // decrement iterator to previous element + void decrement() + { + size_t xmin = container->min_x(); + size_t xmax = container->max_x(); + size_t ymin = container->min_y(); + size_t ymax = container->max_y(); + size_t zmin = container->min_z(); + size_t zmax = container->max_z(); + size_t wmin = container->min_w(); + size_t wmax = container->max_w(); + size_t m = ~size_t(3); + if (w == wmax) { + x = xmax - 1; + y = ymax - 1; + z = zmax - 1; + w = wmax - 1; + } + else { + if (!(x & 3u) || x == xmin) { + x = std::min((x + 4) & m, xmax); + if (!(y & 3u) || y == ymin) { + y = std::min((y + 4) & m, ymax); + if (!(z & 3u) || z == zmin) 
{ + z = std::min((z + 4) & m, zmax); + if (!(w & 3u) || w == wmin) { + w = std::min((w + 4) & m, wmax); + // done with block; advance to next + x = (x - 1) & m; + if (x <= xmin) { + x = xmax; + y = (y - 1) & m; + if (y <= ymin) { + y = ymax; + z = (z - 1) & m; + if (z <= zmin) { + z = zmax; + w = (w - 1) & m; + if (w <= wmin) + w = wmin; + } + } + } + } + --w; + } + --z; + } + --y; + } + --x; + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; + using const_handle::w; +}; + +// random access iterator that visits 4D array or view block by block +template +class iterator : public const_iterator { +public: + // typedefs for STL compatibility + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef ptrdiff_t difference_type; + typedef zfp::internal::dim4::reference reference; + typedef zfp::internal::dim4::pointer pointer; + typedef std::random_access_iterator_tag iterator_category; + + // default constructor + iterator() : const_iterator(0, 0, 0, 0, 0) {} + + // constructor + explicit iterator(container_type* container, size_t x, size_t y, size_t z, size_t w) : const_iterator(container, x, y, z, w) {} + + // dereference iterator + reference operator*() const { return reference(container, x, y, z, w); } + reference operator[](difference_type d) const { return *operator+(d); } + + // iterator arithmetic + iterator operator+(difference_type d) const { iterator it = *this; it.advance(d); return it; } + iterator operator-(difference_type d) const { return operator+(-d); } + difference_type operator-(const iterator& it) const { return offset() - it.offset(); } + + // equality operators + bool operator==(const iterator& it) const { return container == it.container && x == it.x && y == it.y && z == it.z && w == it.w; } + bool operator!=(const iterator& it) const { return !operator==(it); } + + // relational operators + bool operator<=(const iterator& it) const { return container == it.container && offset() <= it.offset(); } + bool operator>=(const iterator& it) const { return container == it.container && offset() >= it.offset(); } + bool operator<(const iterator& it) const { return container == it.container && offset() < it.offset(); } + bool operator>(const iterator& it) const { return container == it.container && offset() > it.offset(); } + + // increment and decrement + iterator& operator++() { increment(); return *this; } + iterator& operator--() { decrement(); return *this; } + iterator operator++(int) { iterator it = *this; increment(); return it; } + iterator operator--(int) { iterator it = *this; decrement(); return it; } + iterator operator+=(difference_type d) { advance(+d); return *this; } + iterator operator-=(difference_type d) { advance(-d); return *this; } + +protected: + using const_iterator::offset; + using const_iterator::advance; + using const_iterator::increment; + using const_iterator::decrement; + using const_iterator::container; + using const_iterator::x; + using const_iterator::y; + using const_iterator::z; + using const_iterator::w; +}; + +} // dim4 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/memory.hpp b/include/zfp/internal/array/memory.hpp new file mode 100644 index 00000000..b6e7b9f6 --- /dev/null +++ b/include/zfp/internal/array/memory.hpp @@ -0,0 +1,200 @@ +#ifndef ZFP_MEMORY_HPP +#define ZFP_MEMORY_HPP + +// Memory management for POD types only. 
Templated functions are provided only +// to avoid the need for casts to/from void* in pass-by-reference calls. + +#ifdef _WIN32 +extern "C" { + #ifdef __MINGW32__ + #include + #endif + + #include +} +#endif + +#include +#include +#include + +// byte alignment of compressed data +#ifndef ZFP_MEMORY_ALIGNMENT + #define ZFP_MEMORY_ALIGNMENT 0x100u +#endif + +#define unused_(x) ((void)(x)) + +namespace zfp { +namespace internal { + +// allocate size bytes +inline void* +allocate(size_t size) +{ + void* ptr = std::malloc(size); + if (!ptr) + throw std::bad_alloc(); + return ptr; +} + +// allocate size bytes with suggested alignment +inline void* +allocate_aligned(size_t size, size_t alignment) +{ + void* ptr = 0; + +#ifdef ZFP_WITH_ALIGNED_ALLOC + #if defined(__INTEL_COMPILER) + ptr = _mm_malloc(size, alignment); + #elif defined(__MINGW32__) + // require: alignment is an integer power of two + ptr = __mingw_aligned_malloc(size, alignment); + #elif defined(_WIN32) + // require: alignment is an integer power of two + ptr = _aligned_malloc(size, alignment); + #elif defined(__MACH__) || (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600) + // require: alignment is an integer power of two >= sizeof(void*) + posix_memalign(&ptr, alignment, size); + #else + // aligned allocation not supported; fall back on unaligned allocation + unused_(alignment); + ptr = allocate(size); + #endif +#else + // aligned allocation not enabled; use unaligned allocation + unused_(alignment); + ptr = allocate(size); +#endif + + if (!ptr) + throw std::bad_alloc(); + + return ptr; +} + +// deallocate memory pointed to by ptr +inline void +deallocate(void* ptr) +{ + std::free(ptr); +} + +// deallocate aligned memory pointed to by ptr +inline void +deallocate_aligned(void* ptr) +{ + if (!ptr) + return; +#ifdef ZFP_WITH_ALIGNED_ALLOC + #ifdef __INTEL_COMPILER + _mm_free(ptr); + #elif defined(__MINGW32__) + __mingw_aligned_free(ptr); + #elif defined(_WIN32) + _aligned_free(ptr); + #else + std::free(ptr); + #endif +#else + std::free(ptr); +#endif +} + +// reallocate buffer to size bytes +template +inline void +reallocate(T*& ptr, size_t size, bool preserve = false) +{ + if (preserve) + ptr = static_cast(std::realloc(ptr, size)); + else { + zfp::internal::deallocate(ptr); + ptr = static_cast(zfp::internal::allocate(size)); + } +} + +// reallocate buffer to new_size bytes with suggested alignment +template +inline void +reallocate_aligned(T*& ptr, size_t new_size, size_t alignment, size_t old_size = 0) +{ + void* p = ptr; + reallocate_aligned(p, new_size, alignment, old_size); + ptr = static_cast(p); +} + +// untyped reallocate buffer to new_size bytes with suggested alignment +template <> +inline void +reallocate_aligned(void*& ptr, size_t new_size, size_t alignment, size_t old_size) +{ + if (old_size) { + // reallocate while preserving contents + void* dst = zfp::internal::allocate_aligned(new_size, alignment); + std::memcpy(dst, ptr, std::min(old_size, new_size)); + zfp::internal::deallocate_aligned(ptr); + ptr = dst; + } + else { + // reallocate without preserving contents + zfp::internal::deallocate_aligned(ptr); + ptr = zfp::internal::allocate_aligned(new_size, alignment); + } +} + +// clone array 'T src[count]' to dst +template +inline void +clone(T*& dst, const T* src, size_t count) +{ + zfp::internal::deallocate(dst); + if (src) { + dst = static_cast(zfp::internal::allocate(count * sizeof(T))); + std::copy(src, src + count, dst); + } + else + dst = 0; +} + +// clone array 'T src[count]' to dst with suggested alignment 
+template +inline void +clone_aligned(T*& dst, const T* src, size_t count, size_t alignment) +{ + void* d = dst; + const void* s = src; + clone_aligned(d, s, count * sizeof(T), alignment); + dst = static_cast(d); + src = static_cast(s); +} + +// untyped, aligned clone of size bytes +template <> +inline void +clone_aligned(void*& dst, const void* src, size_t size, size_t alignment) +{ + zfp::internal::deallocate_aligned(dst); + if (src) { + dst = zfp::internal::allocate_aligned(size, alignment); + std::memcpy(dst, src, size); + } + else + dst = 0; +} + +// return smallest multiple of unit greater than or equal to size +inline size_t +round_up(size_t size, size_t unit) +{ + size += unit - 1; + size -= size % unit; + return size; +} + +} +} + +#undef unused_ + +#endif diff --git a/include/zfp/internal/array/pointer1.hpp b/include/zfp/internal/array/pointer1.hpp new file mode 100644 index 00000000..37876c69 --- /dev/null +++ b/include/zfp/internal/array/pointer1.hpp @@ -0,0 +1,118 @@ +#ifndef ZFP_POINTER1_HPP +#define ZFP_POINTER1_HPP + +namespace zfp { +namespace internal { +namespace dim1 { + +// const pointer to a 1D array or view element +template +class const_pointer : public const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + const_pointer() : const_handle(0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + const_pointer(std::nullptr_t) : const_handle(0, 0) {} +#endif + + // constructor + explicit const_pointer(const container_type* container, size_t x) : const_handle(container, x) {} + + // dereference pointer + const_reference operator*() const { return const_reference(container, x); } + const_reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + const_pointer operator+(ptrdiff_t d) const { const_pointer p = *this; p.advance(d); return p; } + const_pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const const_pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const const_pointer& p) const { return container == p.container && x == p.x; } + bool operator!=(const const_pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const const_pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const const_pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const const_pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const const_pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + const_pointer& operator++() { increment(); return *this; } + const_pointer& operator--() { decrement(); return *this; } + const_pointer operator++(int) { const_pointer p = *this; increment(); return p; } + const_pointer operator--(int) { const_pointer p = *this; decrement(); return p; } + const_pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + const_pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast(x - container->min_x()) + d; } + void index(size_t& x, ptrdiff_t p) const { x = container->min_x() + size_t(p); } + void advance(ptrdiff_t d) { index(x, offset(d)); } + void increment() { ++x; } + void decrement() { --x; } + + using const_handle::container; + using 
const_handle::x; +}; + +// pointer to a 1D array or view element +template +class pointer : public const_pointer { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + pointer() : const_pointer(0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + pointer(std::nullptr_t) : const_pointer(0, 0) {} +#endif + + // constructor + explicit pointer(container_type* container, size_t x) : const_pointer(container, x) {} + + // dereference pointer + reference operator*() const { return reference(container, x); } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + pointer operator+(ptrdiff_t d) const { pointer p = *this; p.advance(d); return p; } + pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const pointer& p) const { return container == p.container && x == p.x; } + bool operator!=(const pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + using const_pointer::offset; + using const_pointer::advance; + using const_pointer::increment; + using const_pointer::decrement; + using const_pointer::container; + using const_pointer::x; +}; + +} // dim1 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/pointer2.hpp b/include/zfp/internal/array/pointer2.hpp new file mode 100644 index 00000000..a074be98 --- /dev/null +++ b/include/zfp/internal/array/pointer2.hpp @@ -0,0 +1,136 @@ +#ifndef ZFP_POINTER2_HPP +#define ZFP_POINTER2_HPP + +namespace zfp { +namespace internal { +namespace dim2 { + +// const pointer to a 2D array or view element +template +class const_pointer : public const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + const_pointer() : const_handle(0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + const_pointer(std::nullptr_t) : const_handle(0, 0, 0) {} +#endif + + // constructor + explicit const_pointer(const container_type* container, size_t x, size_t y) : const_handle(container, x, y) {} + + // dereference pointer + const_reference operator*() const { return const_reference(container, x, y); } + const_reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + const_pointer operator+(ptrdiff_t d) const { const_pointer p = *this; p.advance(d); return p; } + const_pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const const_pointer& p) const { return offset() - p.offset(); } + + // 
equality operators + bool operator==(const const_pointer& p) const { return container == p.container && x == p.x && y == p.y; } + bool operator!=(const const_pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const const_pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const const_pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const const_pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const const_pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + const_pointer& operator++() { increment(); return *this; } + const_pointer& operator--() { decrement(); return *this; } + const_pointer operator++(int) { const_pointer p = *this; increment(); return p; } + const_pointer operator--(int) { const_pointer p = *this; decrement(); return p; } + const_pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + const_pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast(x - container->min_x() + container->size_x() * (y - container->min_y())) + d; } + void index(size_t& x, size_t& y, ptrdiff_t p) const + { + x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= container->size_x(); + y = container->min_y() + size_t(p); + } + void advance(ptrdiff_t d) { index(x, y, offset(d)); } + void increment() + { + if (++x == container->max_x()) { + x = container->min_x(); + ++y; + } + } + void decrement() + { + if (x-- == container->min_x()) { + x += container->size_x(); + --y; + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; +}; + +// pointer to a 2D array or view element +template +class pointer : public const_pointer { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + pointer() : const_pointer(0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + pointer(std::nullptr_t) : const_pointer(0, 0, 0) {} +#endif + + // constructor + explicit pointer(container_type* container, size_t x, size_t y) : const_pointer(container, x, y) {} + + // dereference pointer + reference operator*() const { return reference(container, x, y); } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + pointer operator+(ptrdiff_t d) const { pointer p = *this; p.advance(d); return p; } + pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const pointer& p) const { return container == p.container && x == p.x && y == p.y; } + bool operator!=(const pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer 
operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + using const_pointer::offset; + using const_pointer::advance; + using const_pointer::increment; + using const_pointer::decrement; + using const_pointer::container; + using const_pointer::x; + using const_pointer::y; +}; + +} // dim2 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/pointer3.hpp b/include/zfp/internal/array/pointer3.hpp new file mode 100644 index 00000000..8f8dee61 --- /dev/null +++ b/include/zfp/internal/array/pointer3.hpp @@ -0,0 +1,145 @@ +#ifndef ZFP_POINTER3_HPP +#define ZFP_POINTER3_HPP + +namespace zfp { +namespace internal { +namespace dim3 { + +// const pointer to a 3D array or view element +template +class const_pointer : public const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + const_pointer() : const_handle(0, 0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + const_pointer(std::nullptr_t) : const_handle(0, 0, 0, 0) {} +#endif + + // constructor + explicit const_pointer(const container_type* container, size_t x, size_t y, size_t z) : const_handle(container, x, y, z) {} + + // dereference pointer + const_reference operator*() const { return const_reference(container, x, y, z); } + const_reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + const_pointer operator+(ptrdiff_t d) const { const_pointer p = *this; p.advance(d); return p; } + const_pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const const_pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const const_pointer& p) const { return container == p.container && x == p.x && y == p.y && z == p.z; } + bool operator!=(const const_pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const const_pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const const_pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const const_pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const const_pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + const_pointer& operator++() { increment(); return *this; } + const_pointer& operator--() { decrement(); return *this; } + const_pointer operator++(int) { const_pointer p = *this; increment(); return p; } + const_pointer operator--(int) { const_pointer p = *this; decrement(); return p; } + const_pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + const_pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast(x - container->min_x() + container->size_x() * (y - container->min_y() + container->size_y() * (z - container->min_z()))) + d; } + void index(size_t& x, size_t& y, size_t& z, ptrdiff_t p) const + { + x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= container->size_x(); + y = container->min_y() + size_t(p % ptrdiff_t(container->size_y())); p /= container->size_y(); + z = container->min_z() + size_t(p); + 
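+ // illustration (assumed values): with size_x() == size_y() == 4 and zero minima, p == 37 maps to x == 1, y == 1, z == 2; pointers thus advance in raster order, unlike the block-order iterators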
} + void advance(ptrdiff_t d) { index(x, y, z, offset(d)); } + void increment() + { + if (++x == container->max_x()) { + x = container->min_x(); + if (++y == container->max_y()) { + y = container->min_y(); + ++z; + } + } + } + void decrement() + { + if (x-- == container->min_x()) { + x += container->size_x(); + if (y-- == container->min_y()) { + y += container->size_y(); + --z; + } + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; +}; + +// pointer to a 3D array or view element +template +class pointer : public const_pointer { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + pointer() : const_pointer(0, 0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + pointer(std::nullptr_t) : const_pointer(0, 0, 0, 0) {} +#endif + + // constructor + explicit pointer(container_type* container, size_t x, size_t y, size_t z) : const_pointer(container, x, y, z) {} + + // dereference pointer + reference operator*() const { return reference(container, x, y, z); } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + pointer operator+(ptrdiff_t d) const { pointer p = *this; p.advance(d); return p; } + pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const pointer& p) const { return container == p.container && x == p.x && y == p.y && z == p.z; } + bool operator!=(const pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + using const_pointer::offset; + using const_pointer::advance; + using const_pointer::increment; + using const_pointer::decrement; + using const_pointer::container; + using const_pointer::x; + using const_pointer::y; + using const_pointer::z; +}; + +} // dim3 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/pointer4.hpp b/include/zfp/internal/array/pointer4.hpp new file mode 100644 index 00000000..8adb97f3 --- /dev/null +++ b/include/zfp/internal/array/pointer4.hpp @@ -0,0 +1,154 @@ +#ifndef ZFP_POINTER4_HPP +#define ZFP_POINTER4_HPP + +namespace zfp { +namespace internal { +namespace dim4 { + +// const pointer to a 4D array or view element +template +class const_pointer : public const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + const_pointer() : const_handle(0, 0, 0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + const_pointer(std::nullptr_t) : const_handle(0, 0, 0, 0, 
0) {} +#endif + + // constructor + explicit const_pointer(const container_type* container, size_t x, size_t y, size_t z, size_t w) : const_handle(container, x, y, z, w) {} + + // dereference pointer + const_reference operator*() const { return const_reference(container, x, y, z, w); } + const_reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + const_pointer operator+(ptrdiff_t d) const { const_pointer p = *this; p.advance(d); return p; } + const_pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const const_pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const const_pointer& p) const { return container == p.container && x == p.x && y == p.y && z == p.z && w == p.w; } + bool operator!=(const const_pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const const_pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const const_pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const const_pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const const_pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + const_pointer& operator++() { increment(); return *this; } + const_pointer& operator--() { decrement(); return *this; } + const_pointer operator++(int) { const_pointer p = *this; increment(); return p; } + const_pointer operator--(int) { const_pointer p = *this; decrement(); return p; } + const_pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + const_pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + ptrdiff_t offset(ptrdiff_t d = 0) const { return static_cast(x - container->min_x() + container->size_x() * (y - container->min_y() + container->size_y() * (z - container->min_z() + container->size_z() * (w - container->min_w())))) + d; } + void index(size_t& x, size_t& y, size_t& z, size_t & w, ptrdiff_t p) const + { + x = container->min_x() + size_t(p % ptrdiff_t(container->size_x())); p /= container->size_x(); + y = container->min_y() + size_t(p % ptrdiff_t(container->size_y())); p /= container->size_y(); + z = container->min_z() + size_t(p % ptrdiff_t(container->size_z())); p /= container->size_z(); + w = container->min_w() + size_t(p); + } + void advance(ptrdiff_t d) { index(x, y, z, w, offset(d)); } + void increment() + { + if (++x == container->max_x()) { + x = container->min_x(); + if (++y == container->max_y()) { + y = container->min_y(); + if (++z == container->max_z()) { + z = container->min_z(); + ++w; + } + } + } + } + void decrement() + { + if (x-- == container->min_x()) { + x += container->size_x(); + if (y-- == container->min_y()) { + y += container->size_y(); + if (z-- == container->min_z()) { + z += container->size_z(); + --w; + } + } + } + } + + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; + using const_handle::w; +}; + +// pointer to a 4D array or view element +template +class pointer : public const_pointer { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // default constructor + pointer() : const_pointer(0, 0, 0, 0, 0) {} +#if defined(__cplusplus) && __cplusplus >= 201103L + pointer(std::nullptr_t) : const_pointer(0, 0, 0, 0, 0) {} +#endif + + // constructor + 
explicit pointer(container_type* container, size_t x, size_t y, size_t z, size_t w) : const_pointer(container, x, y, z, w) {} + + // dereference pointer + reference operator*() const { return reference(container, x, y, z, w); } + reference operator[](ptrdiff_t d) const { return *operator+(d); } + + // pointer arithmetic + pointer operator+(ptrdiff_t d) const { pointer p = *this; p.advance(d); return p; } + pointer operator-(ptrdiff_t d) const { return operator+(-d); } + ptrdiff_t operator-(const pointer& p) const { return offset() - p.offset(); } + + // equality operators + bool operator==(const pointer& p) const { return container == p.container && x == p.x && y == p.y && z == p.z && w == p.w; } + bool operator!=(const pointer& p) const { return !operator==(p); } + + // relational operators + bool operator<=(const pointer& p) const { return container == p.container && offset() <= p.offset(); } + bool operator>=(const pointer& p) const { return container == p.container && offset() >= p.offset(); } + bool operator<(const pointer& p) const { return container == p.container && offset() < p.offset(); } + bool operator>(const pointer& p) const { return container == p.container && offset() > p.offset(); } + + // increment and decrement + pointer& operator++() { increment(); return *this; } + pointer& operator--() { decrement(); return *this; } + pointer operator++(int) { pointer p = *this; increment(); return p; } + pointer operator--(int) { pointer p = *this; decrement(); return p; } + pointer operator+=(ptrdiff_t d) { advance(+d); return *this; } + pointer operator-=(ptrdiff_t d) { advance(-d); return *this; } + +protected: + using const_pointer::offset; + using const_pointer::advance; + using const_pointer::increment; + using const_pointer::decrement; + using const_pointer::container; + using const_pointer::x; + using const_pointer::y; + using const_pointer::z; + using const_pointer::w; +}; + +} // dim4 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/reference1.hpp b/include/zfp/internal/array/reference1.hpp new file mode 100644 index 00000000..e41cc8b5 --- /dev/null +++ b/include/zfp/internal/array/reference1.hpp @@ -0,0 +1,78 @@ +#ifndef ZFP_REFERENCE1_HPP +#define ZFP_REFERENCE1_HPP + +namespace zfp { +namespace internal { +namespace dim1 { + +// const reference to a 1D array or view element +template +class const_reference : const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit const_reference(const container_type* container, size_t x) : const_handle(container, x) {} + + // inspector + operator value_type() const { return get(); } + + // pointer to referenced element + const_pointer operator&() const { return const_pointer(container, x); } + +protected: + using const_handle::get; + using const_handle::container; + using const_handle::x; +}; + +// reference to a 1D array or view element +template +class reference : public const_reference { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit reference(container_type* container, size_t x) : const_reference(container, x) {} + + // copy constructor (to satisfy rule of three) + reference(const reference& r) : const_reference(r.container, r.x) {} + + // assignment + reference operator=(const reference& r) { set(r.get()); return *this; } + reference operator=(value_type val) { set(val); return *this; } + + // compound assignment + reference 
operator+=(value_type val) { container->add(x, val); return *this; } + reference operator-=(value_type val) { container->sub(x, val); return *this; } + reference operator*=(value_type val) { container->mul(x, val); return *this; } + reference operator/=(value_type val) { container->div(x, val); return *this; } + + // pointer to referenced element + pointer operator&() const { return pointer(container, x); } + + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + value_type x = a.get(); + value_type y = b.get(); + b.set(x); + a.set(y); + } + +protected: + // assign value through reference + void set(value_type val) { container->set(x, val); } + + using const_reference::get; + using const_reference::container; + using const_reference::x; +}; + +} // dim1 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/reference2.hpp b/include/zfp/internal/array/reference2.hpp new file mode 100644 index 00000000..b16484fb --- /dev/null +++ b/include/zfp/internal/array/reference2.hpp @@ -0,0 +1,80 @@ +#ifndef ZFP_REFERENCE2_HPP +#define ZFP_REFERENCE2_HPP + +namespace zfp { +namespace internal { +namespace dim2 { + +// const reference to a 2D array or view element +template +class const_reference : const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit const_reference(const container_type* container, size_t x, size_t y) : const_handle(container, x, y) {} + + // inspector + operator value_type() const { return get(); } + + // pointer to referenced element + const_pointer operator&() const { return const_pointer(container, x, y); } + +protected: + using const_handle::get; + using const_handle::container; + using const_handle::x; + using const_handle::y; +}; + +// reference to a 2D array or view element +template +class reference : public const_reference { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit reference(container_type* container, size_t x, size_t y) : const_reference(container, x, y) {} + + // copy constructor (to satisfy rule of three) + reference(const reference& r) : const_reference(r.container, r.x, r.y) {} + + // assignment + reference operator=(const reference& r) { set(r.get()); return *this; } + reference operator=(value_type val) { set(val); return *this; } + + // compound assignment + reference operator+=(value_type val) { container->add(x, y, val); return *this; } + reference operator-=(value_type val) { container->sub(x, y, val); return *this; } + reference operator*=(value_type val) { container->mul(x, y, val); return *this; } + reference operator/=(value_type val) { container->div(x, y, val); return *this; } + + // pointer to referenced element + pointer operator&() const { return pointer(container, x, y); } + + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + value_type x = a.get(); + value_type y = b.get(); + b.set(x); + a.set(y); + } + +protected: + // assign value through reference + void set(value_type val) { container->set(x, y, val); } + + using const_reference::get; + using const_reference::container; + using const_reference::x; + using const_reference::y; +}; + +} // dim2 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/reference3.hpp b/include/zfp/internal/array/reference3.hpp new file mode 100644 index 00000000..ecb52d30 --- /dev/null +++ 
b/include/zfp/internal/array/reference3.hpp @@ -0,0 +1,82 @@ +#ifndef ZFP_REFERENCE3_HPP +#define ZFP_REFERENCE3_HPP + +namespace zfp { +namespace internal { +namespace dim3 { + +// const reference to a 3D array or view element +template +class const_reference : const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit const_reference(const container_type* container, size_t x, size_t y, size_t z) : const_handle(container, x, y, z) {} + + // inspector + operator value_type() const { return get(); } + + // pointer to referenced element + const_pointer operator&() const { return const_pointer(container, x, y, z); } + +protected: + using const_handle::get; + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; +}; + +// reference to a 3D array or view element +template +class reference : public const_reference { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit reference(container_type* container, size_t x, size_t y, size_t z) : const_reference(container, x, y, z) {} + + // copy constructor (to satisfy rule of three) + reference(const reference& r) : const_reference(r.container, r.x, r.y, r.z) {} + + // assignment + reference operator=(const reference& r) { set(r.get()); return *this; } + reference operator=(value_type val) { set(val); return *this; } + + // compound assignment + reference operator+=(value_type val) { container->add(x, y, z, val); return *this; } + reference operator-=(value_type val) { container->sub(x, y, z, val); return *this; } + reference operator*=(value_type val) { container->mul(x, y, z, val); return *this; } + reference operator/=(value_type val) { container->div(x, y, z, val); return *this; } + + // pointer to referenced element + pointer operator&() const { return pointer(container, x, y, z); } + + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + value_type x = a.get(); + value_type y = b.get(); + b.set(x); + a.set(y); + } + +protected: + // assign value through reference + void set(value_type val) { container->set(x, y, z, val); } + + using const_reference::get; + using const_reference::container; + using const_reference::x; + using const_reference::y; + using const_reference::z; +}; + +} // dim3 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/reference4.hpp b/include/zfp/internal/array/reference4.hpp new file mode 100644 index 00000000..1d0c3ca3 --- /dev/null +++ b/include/zfp/internal/array/reference4.hpp @@ -0,0 +1,84 @@ +#ifndef ZFP_REFERENCE4_HPP +#define ZFP_REFERENCE4_HPP + +namespace zfp { +namespace internal { +namespace dim4 { + +// const reference to a 4D array or view element +template +class const_reference : const_handle { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit const_reference(const container_type* container, size_t x, size_t y, size_t z, size_t w) : const_handle(container, x, y, z, w) {} + + // inspector + operator value_type() const { return get(); } + + // pointer to referenced element + const_pointer operator&() const { return const_pointer(container, x, y, z, w); } + +protected: + using const_handle::get; + using const_handle::container; + using const_handle::x; + using const_handle::y; + using const_handle::z; + using const_handle::w; +}; + +// reference 
to a 4D array or view element +template +class reference : public const_reference { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // constructor + explicit reference(container_type* container, size_t x, size_t y, size_t z, size_t w) : const_reference(container, x, y, z, w) {} + + // copy constructor (to satisfy rule of three) + reference(const reference& r) : const_reference(r.container, r.x, r.y, r.z, r.w) {} + + // assignment + reference operator=(const reference& r) { set(r.get()); return *this; } + reference operator=(value_type val) { set(val); return *this; } + + // compound assignment + reference operator+=(value_type val) { container->add(x, y, z, w, val); return *this; } + reference operator-=(value_type val) { container->sub(x, y, z, w, val); return *this; } + reference operator*=(value_type val) { container->mul(x, y, z, w, val); return *this; } + reference operator/=(value_type val) { container->div(x, y, z, w, val); return *this; } + + // pointer to referenced element + pointer operator&() const { return pointer(container, x, y, z, w); } + + // swap two array elements via proxy references + friend void swap(reference a, reference b) + { + value_type x = a.get(); + value_type y = b.get(); + b.set(x); + a.set(y); + } + +protected: + // assign value through reference + void set(value_type val) { container->set(x, y, z, w, val); } + + using const_reference::get; + using const_reference::container; + using const_reference::x; + using const_reference::y; + using const_reference::z; + using const_reference::w; +}; + +} // dim4 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/store.hpp b/include/zfp/internal/array/store.hpp new file mode 100644 index 00000000..f649972f --- /dev/null +++ b/include/zfp/internal/array/store.hpp @@ -0,0 +1,255 @@ +#ifndef ZFP_STORE_HPP +#define ZFP_STORE_HPP + +#include +#include +#include "zfp/internal/array/memory.hpp" + +namespace zfp { +namespace internal { + +// base class for block store +template +class BlockStore { +public: + // compression mode + zfp_mode mode() const { return codec.mode(); } + + // rate in bits per value (fixed-rate mode only) + double rate() const { return codec.rate(); } + + // precision in uncompressed bits per value (fixed-precision mode only) + uint precision() const { return codec.precision(); } + + // accuracy as absolute error tolerance (fixed-accuracy mode only) + double accuracy() const { return codec.accuracy(); } + + // compression parameters (all compression modes) + void params(uint* minbits, uint* maxbits, uint* maxprec, int* minexp) const { codec.params(minbits, maxbits, maxprec, minexp); } + + // enable reversible (lossless) mode + void set_reversible() + { + set_variable_rate(); + codec.set_reversible(); + clear(); + } + + // set fixed rate in compressed bits per value with optional word alignment + double set_rate(double rate, bool align) + { + rate = codec.set_rate(rate, align); + uint maxbits; + codec.params(0, &maxbits, 0, 0); + index.set_block_size(maxbits); + alloc(true); + return rate; + } + + // set precision in uncompressed bits per value + uint set_precision(uint precision) + { + set_variable_rate(); + precision = codec.set_precision(precision); + clear(); + return precision; + } + + // set accuracy as absolute error tolerance + double set_accuracy(double tolerance) + { + set_variable_rate(); + tolerance = codec.set_accuracy(tolerance); + clear(); + return tolerance; + } + + // set expert mode compression parameters 
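// (if minbits != maxbits the rate may vary per block, so the block index
// must support variable-rate storage; set_variable_rate() below throws
// zfp::exception otherwise)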
+ bool set_params(uint minbits, uint maxbits, uint maxprec, int minexp) + { + if (minbits != maxbits) + set_variable_rate(); + bool status = codec.set_params(minbits, maxbits, maxprec, minexp); + clear(); + return status; + } + + // set compression mode and parameters + void set_config(const zfp_config& config) + { + switch (config.mode) { + case zfp_mode_reversible: + set_reversible(); + break; + case zfp_mode_fixed_rate: + if (config.arg.rate < 0) + set_rate(-config.arg.rate, true); + else + set_rate(+config.arg.rate, false); + break; + case zfp_mode_fixed_precision: + set_precision(config.arg.precision); + break; + case zfp_mode_fixed_accuracy: + set_accuracy(config.arg.tolerance); + break; + case zfp_mode_expert: + set_params(config.arg.expert.minbits, config.arg.expert.maxbits, config.arg.expert.maxprec, config.arg.expert.minexp); + break; + default: + throw zfp::exception("zfp compression mode not supported by array"); + } + } + + // clear store and reallocate memory for buffer + void clear() + { + index.clear(); + alloc(true); + } + + // flush any buffered block index data + void flush() { index.flush(); } + + // shrink buffer to match size of compressed data + void compact() + { + size_t size = zfp::internal::round_up(index.range(), codec.alignment() * CHAR_BIT) / CHAR_BIT; + if (bytes > size) { + codec.close(); + zfp::internal::reallocate_aligned(data, size, ZFP_MEMORY_ALIGNMENT, bytes); + bytes = size; + codec.open(data, bytes); + } + } + + // increment private view reference count (for thread safety) + void reference() + { +#ifdef _OPENMP + #pragma omp critical(references) + { + references++; + codec.set_thread_safety(references > 1); + } +#endif + } + + // decrement private view reference count (for thread safety) + void unreference() + { +#ifdef _OPENMP + #pragma omp critical(references) + { + references--; + codec.set_thread_safety(references > 1); + } +#endif + } + + // byte size of store data structure components indicated by mask + virtual size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += index.size_bytes(mask); + size += codec.size_bytes(mask); + if (mask & ZFP_DATA_PAYLOAD) + size += bytes; + if (mask & ZFP_DATA_META) + size += sizeof(*this); + return size; + } + + // number of bytes of compressed data + size_t compressed_size() const { return bytes; } + + // pointer to compressed data for read or write access + void* compressed_data() const { return data; } + +protected: + // protected default constructor + BlockStore() : + data(0), + bytes(0), + references(0), + index(0) + {} + + // destructor + virtual ~BlockStore() { free(); } + + // buffer size in bytes needed for current codec settings + virtual size_t buffer_size() const = 0; + + // number of elements per block + virtual size_t block_size() const = 0; + + // total number of blocks + virtual size_t blocks() const = 0; + + // ensure variable rate is supported + void set_variable_rate() + { + if (!index.has_variable_rate()) + throw zfp::exception("zfp index does not support variable rate"); + } + + // perform a deep copy + void deep_copy(const BlockStore& s) + { + free(); + zfp::internal::clone_aligned(data, s.data, s.bytes, ZFP_MEMORY_ALIGNMENT); + bytes = s.bytes; + references = s.references; + index = s.index; + codec = s.codec; + codec.open(data, bytes); + } + + // allocate memory for block store + void alloc(bool clear) + { + free(); + bytes = buffer_size(); + zfp::internal::reallocate_aligned(data, bytes, ZFP_MEMORY_ALIGNMENT); + if (clear) + std::fill(static_cast(data), 
static_cast(data) + bytes, uchar(0)); + codec.open(data, bytes); + } + + // free block store + void free() + { + if (data) { + zfp::internal::deallocate_aligned(data); + data = 0; + bytes = 0; + codec.close(); + } + } + + // bit offset to block store + bitstream_offset offset(size_t block_index) const { return index.block_offset(block_index); } + + // shape 0 <= m <= 3 of block containing index i, 0 <= i <= n - 1 + static uint shape_code(size_t i, size_t n) + { + // handle partial blocks efficiently using no conditionals + size_t m = i ^ n; // m < 4 iff partial block + m -= 4; // m < 0 iff partial block + m >>= CHAR_BIT * sizeof(m) - 2; // m = 3 iff partial block; otherwise m = 0 + m &= -n; // m = 4 - w + return static_cast(m); + } + + void* data; // pointer to compressed blocks + size_t bytes; // compressed data size + size_t references; // private view references to array (for thread safety) + Index index; // block index (size and offset) + Codec codec; // compression codec +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/store1.hpp b/include/zfp/internal/array/store1.hpp new file mode 100644 index 00000000..aeb05fa8 --- /dev/null +++ b/include/zfp/internal/array/store1.hpp @@ -0,0 +1,140 @@ +#ifndef ZFP_STORE1_HPP +#define ZFP_STORE1_HPP + +#include "zfp/internal/array/store.hpp" + +namespace zfp { +namespace internal { + +// compressed block store for 1D array +template +class BlockStore1 : public BlockStore { +public: + // default constructor + BlockStore1() : + nx(0), + bx(0) + {} + + // block store for array of size nx and given configuration + BlockStore1(size_t nx, const zfp_config& config) + { + set_size(nx); + this->set_config(config); + } + + // perform a deep copy + void deep_copy(const BlockStore1& s) + { + free(); + BlockStore::deep_copy(s); + nx = s.nx; + bx = s.bx; + } + + // resize array + void resize(size_t nx, bool clear = true) + { + free(); + set_size(nx); + if (blocks()) + alloc(clear); + } + + // byte size of store data structure components indicated by mask + virtual size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += BlockStore::size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this) - sizeof(BlockStore); + return size; + } + + // conservative buffer size + virtual size_t buffer_size() const + { + zfp_field* field = zfp_field_1d(0, codec.type, nx); + size_t size = codec.buffer_size(field); + zfp_field_free(field); + return size; + } + + // number of elements per block + virtual size_t block_size() const { return 4; } + + // total number of blocks + virtual size_t blocks() const { return bx; } + + // array size in blocks + size_t block_size_x() const { return bx; } + + // flat block index for element i + size_t block_index(size_t i) const { return i / 4; } + + // encoding of block dimensions + uint block_shape(size_t block_index) const + { + size_t i = 4 * block_index; + uint mx = shape_code(i, nx); + return mx; + } + + // encode contiguous block with given index + size_t encode(size_t block_index, const Scalar* block) + { + size_t size = codec.encode_block(offset(block_index), block_shape(block_index), block); + index.set_block_size(block_index, size); + return size; + } + + // encode block with given index from strided array + size_t encode(size_t block_index, const Scalar* p, ptrdiff_t sx) + { + size_t size = codec.encode_block_strided(offset(block_index), block_shape(block_index), p, sx); + index.set_block_size(block_index, size); + return size; + } + + // decode contiguous block with 
given index + size_t decode(size_t block_index, Scalar* block) const + { + return codec.decode_block(offset(block_index), block_shape(block_index), block); + } + + // decode block with given index to strided array + size_t decode(size_t block_index, Scalar* p, ptrdiff_t sx) const + { + return codec.decode_block_strided(offset(block_index), block_shape(block_index), p, sx); + } + +protected: + using BlockStore::alloc; + using BlockStore::free; + using BlockStore::offset; + using BlockStore::shape_code; + using BlockStore::index; + using BlockStore::codec; + + // set array dimensions + void set_size(size_t nx) + { + if (nx == 0) { + this->nx = 0; + bx = 0; + } + else { + this->nx = nx; + bx = (nx + 3) / 4; + } + index.resize(blocks()); + } + + size_t nx; // array dimensions + size_t bx; // array dimensions in number of blocks +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/store2.hpp b/include/zfp/internal/array/store2.hpp new file mode 100644 index 00000000..466067ac --- /dev/null +++ b/include/zfp/internal/array/store2.hpp @@ -0,0 +1,147 @@ +#ifndef ZFP_STORE2_HPP +#define ZFP_STORE2_HPP + +#include "zfp/internal/array/store.hpp" + +namespace zfp { +namespace internal { + +// compressed block store for 2D array +template +class BlockStore2 : public BlockStore { +public: + // default constructor + BlockStore2() : + nx(0), ny(0), + bx(0), by(0) + {} + + // block store for array of size nx * ny and given configuration + BlockStore2(size_t nx, size_t ny, const zfp_config& config) + { + set_size(nx, ny); + this->set_config(config); + } + + // perform a deep copy + void deep_copy(const BlockStore2& s) + { + free(); + BlockStore::deep_copy(s); + nx = s.nx; + ny = s.ny; + bx = s.bx; + by = s.by; + } + + // resize array + void resize(size_t nx, size_t ny, bool clear = true) + { + free(); + set_size(nx, ny); + if (blocks()) + alloc(clear); + } + + // byte size of store data structure components indicated by mask + virtual size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += BlockStore::size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this) - sizeof(BlockStore); + return size; + } + + // conservative buffer size + virtual size_t buffer_size() const + { + zfp_field* field = zfp_field_2d(0, codec.type, nx, ny); + size_t size = codec.buffer_size(field); + zfp_field_free(field); + return size; + } + + // number of elements per block + virtual size_t block_size() const { return 4 * 4; } + + // total number of blocks + virtual size_t blocks() const { return bx * by; } + + // array size in blocks + size_t block_size_x() const { return bx; } + size_t block_size_y() const { return by; } + + // flat block index for element (i, j) + size_t block_index(size_t i, size_t j) const { return (i / 4) + bx * (j / 4); } + + // encoding of block dimensions + uint block_shape(size_t block_index) const + { + size_t i = 4 * (block_index % bx); block_index /= bx; + size_t j = 4 * block_index; + uint mx = shape_code(i, nx); + uint my = shape_code(j, ny); + return mx + 4 * my; + } + + // encode contiguous block with given index + size_t encode(size_t block_index, const Scalar* block) + { + size_t size = codec.encode_block(offset(block_index), block_shape(block_index), block); + index.set_block_size(block_index, size); + return size; + } + + // encode block with given index from strided array + size_t encode(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) + { + size_t size = codec.encode_block_strided(offset(block_index), 
block_shape(block_index), p, sx, sy); + index.set_block_size(block_index, size); + return size; + } + + // decode contiguous block with given index + size_t decode(size_t block_index, Scalar* block) const + { + return codec.decode_block(offset(block_index), block_shape(block_index), block); + } + + // decode block with given index to strided array + size_t decode(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) const + { + return codec.decode_block_strided(offset(block_index), block_shape(block_index), p, sx, sy); + } + +protected: + using BlockStore::alloc; + using BlockStore::free; + using BlockStore::offset; + using BlockStore::shape_code; + using BlockStore::index; + using BlockStore::codec; + + // set array dimensions + void set_size(size_t nx, size_t ny) + { + if (nx == 0 || ny == 0) { + this->nx = this->ny = 0; + bx = by = 0; + } + else { + this->nx = nx; + this->ny = ny; + bx = (nx + 3) / 4; + by = (ny + 3) / 4; + } + index.resize(blocks()); + } + + size_t nx, ny; // array dimensions + size_t bx, by; // array dimensions in number of blocks +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/store3.hpp b/include/zfp/internal/array/store3.hpp new file mode 100644 index 00000000..cb2afb73 --- /dev/null +++ b/include/zfp/internal/array/store3.hpp @@ -0,0 +1,154 @@ +#ifndef ZFP_STORE3_HPP +#define ZFP_STORE3_HPP + +#include "zfp/internal/array/store.hpp" + +namespace zfp { +namespace internal { + +// compressed block store for 3D array +template +class BlockStore3 : public BlockStore { +public: + // default constructor + BlockStore3() : + nx(0), ny(0), nz(0), + bx(0), by(0), bz(0) + {} + + // block store for array of size nx * ny * nz and given configuration + BlockStore3(size_t nx, size_t ny, size_t nz, const zfp_config& config) + { + set_size(nx, ny, nz); + this->set_config(config); + } + + // perform a deep copy + void deep_copy(const BlockStore3& s) + { + free(); + BlockStore::deep_copy(s); + nx = s.nx; + ny = s.ny; + nz = s.nz; + bx = s.bx; + by = s.by; + bz = s.bz; + } + + // resize array + void resize(size_t nx, size_t ny, size_t nz, bool clear = true) + { + free(); + set_size(nx, ny, nz); + if (blocks()) + alloc(clear); + } + + // byte size of store data structure components indicated by mask + virtual size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += BlockStore::size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this) - sizeof(BlockStore); + return size; + } + + // conservative buffer size + virtual size_t buffer_size() const + { + zfp_field* field = zfp_field_3d(0, codec.type, nx, ny, nz); + size_t size = codec.buffer_size(field); + zfp_field_free(field); + return size; + } + + // number of elements per block + virtual size_t block_size() const { return 4 * 4 * 4; } + + // total number of blocks + virtual size_t blocks() const { return bx * by * bz; } + + // array size in blocks + size_t block_size_x() const { return bx; } + size_t block_size_y() const { return by; } + size_t block_size_z() const { return bz; } + + // flat block index for block containing element (i, j, k) + size_t block_index(size_t i, size_t j, size_t k) const { return (i / 4) + bx * ((j / 4) + by * (k / 4)); } + + // encoding of block dimensions + uint block_shape(size_t block_index) const + { + size_t i = 4 * (block_index % bx); block_index /= bx; + size_t j = 4 * (block_index % by); block_index /= by; + size_t k = 4 * block_index; + uint mx = shape_code(i, nx); + uint my = shape_code(j, ny); + uint mz = shape_code(k, nz); 
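// (each per-dimension code is 0 for a full block of 4 values and 4 - w for a
// partial edge block of w values, per shape_code(); the three codes are packed
// as a base-4 number, e.g. a block that is full in x and y but only 2 values
// deep in z gives mx = my = 0, mz = 2, i.e. shape 0 + 4 * (0 + 4 * 2) = 32)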
+ return mx + 4 * (my + 4 * mz); + } + + // encode contiguous block with given index + size_t encode(size_t block_index, const Scalar* block) + { + size_t size = codec.encode_block(offset(block_index), block_shape(block_index), block); + index.set_block_size(block_index, size); + return size; + } + + // encode block with given index from strided array + size_t encode(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) + { + size_t size = codec.encode_block_strided(offset(block_index), block_shape(block_index), p, sx, sy, sz); + index.set_block_size(block_index, size); + return size; + } + + // decode contiguous block with given index + size_t decode(size_t block_index, Scalar* block) const + { + return codec.decode_block(offset(block_index), block_shape(block_index), block); + } + + // decode block with given index to strided array + size_t decode(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) const + { + return codec.decode_block_strided(offset(block_index), block_shape(block_index), p, sx, sy, sz); + } + +protected: + using BlockStore::alloc; + using BlockStore::free; + using BlockStore::offset; + using BlockStore::shape_code; + using BlockStore::index; + using BlockStore::codec; + + // set array dimensions + void set_size(size_t nx, size_t ny, size_t nz) + { + if (nx == 0 || ny == 0 || nz == 0) { + this->nx = this->ny = this->nz = 0; + bx = by = bz = 0; + } + else { + this->nx = nx; + this->ny = ny; + this->nz = nz; + bx = (nx + 3) / 4; + by = (ny + 3) / 4; + bz = (nz + 3) / 4; + } + index.resize(blocks()); + } + + size_t nx, ny, nz; // array dimensions + size_t bx, by, bz; // array dimensions in number of blocks +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/store4.hpp b/include/zfp/internal/array/store4.hpp new file mode 100644 index 00000000..dbea0c98 --- /dev/null +++ b/include/zfp/internal/array/store4.hpp @@ -0,0 +1,161 @@ +#ifndef ZFP_STORE4_HPP +#define ZFP_STORE4_HPP + +#include "zfp/internal/array/store.hpp" + +namespace zfp { +namespace internal { + +// compressed block store for 4D array +template +class BlockStore4 : public BlockStore { +public: + // default constructor + BlockStore4() : + nx(0), ny(0), nz(0), nw(0), + bx(0), by(0), bz(0), bw(0) + {} + + // block store for array of size nx * ny * nz * nw and given configuration + BlockStore4(size_t nx, size_t ny, size_t nz, size_t nw, const zfp_config& config) + { + set_size(nx, ny, nz, nw); + this->set_config(config); + } + + // perform a deep copy + void deep_copy(const BlockStore4& s) + { + free(); + BlockStore::deep_copy(s); + nx = s.nx; + ny = s.ny; + nz = s.nz; + nw = s.nw; + bx = s.bx; + by = s.by; + bz = s.bz; + bw = s.bw; + } + + // resize array + void resize(size_t nx, size_t ny, size_t nz, size_t nw, bool clear = true) + { + free(); + set_size(nx, ny, nz, nw); + if (blocks()) + alloc(clear); + } + + // byte size of store data structure components indicated by mask + virtual size_t size_bytes(uint mask = ZFP_DATA_ALL) const + { + size_t size = 0; + size += BlockStore::size_bytes(mask); + if (mask & ZFP_DATA_META) + size += sizeof(*this) - sizeof(BlockStore); + return size; + } + + // conservative buffer size + virtual size_t buffer_size() const + { + zfp_field* field = zfp_field_4d(0, codec.type, nx, ny, nz, nw); + size_t size = codec.buffer_size(field); + zfp_field_free(field); + return size; + } + + // number of elements per block + virtual size_t block_size() const { return 4 * 4 * 4 * 4; } + + // total number of blocks + 
virtual size_t blocks() const { return bx * by * bz * bw; } + + // array size in blocks + size_t block_size_x() const { return bx; } + size_t block_size_y() const { return by; } + size_t block_size_z() const { return bz; } + size_t block_size_w() const { return bw; } + + // flat block index for element (i, j, k, l) + size_t block_index(size_t i, size_t j, size_t k, size_t l) const { return (i / 4) + bx * ((j / 4) + by * ((k / 4) + bz * (l / 4))); } + + // encoding of block dimensions + uint block_shape(size_t block_index) const + { + size_t i = 4 * (block_index % bx); block_index /= bx; + size_t j = 4 * (block_index % by); block_index /= by; + size_t k = 4 * (block_index % bz); block_index /= bz; + size_t l = 4 * block_index; + uint mx = shape_code(i, nx); + uint my = shape_code(j, ny); + uint mz = shape_code(k, nz); + uint mw = shape_code(l, nw); + return mx + 4 * (my + 4 * (mz + 4 * mw)); + } + + // encode contiguous block with given index + size_t encode(size_t block_index, const Scalar* block) + { + size_t size = codec.encode_block(offset(block_index), block_shape(block_index), block); + index.set_block_size(block_index, size); + return size; + } + + // encode block with given index from strided array + size_t encode(size_t block_index, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) + { + size_t size = codec.encode_block_strided(offset(block_index), block_shape(block_index), p, sx, sy, sz, sw); + index.set_block_size(block_index, size); + return size; + } + + // decode contiguous block with given index + size_t decode(size_t block_index, Scalar* block) const + { + return codec.decode_block(offset(block_index), block_shape(block_index), block); + } + + // decode block with given index to strided array + size_t decode(size_t block_index, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) const + { + return codec.decode_block_strided(offset(block_index), block_shape(block_index), p, sx, sy, sz, sw); + } + +protected: + using BlockStore::alloc; + using BlockStore::free; + using BlockStore::offset; + using BlockStore::shape_code; + using BlockStore::index; + using BlockStore::codec; + + // set array dimensions + void set_size(size_t nx, size_t ny, size_t nz, size_t nw) + { + if (nx == 0 || ny == 0 || nz == 0 || nw == 0) { + this->nx = this->ny = this->nz = this->nw = 0; + bx = by = bz = bw = 0; + } + else { + this->nx = nx; + this->ny = ny; + this->nz = nz; + this->nw = nw; + bx = (nx + 3) / 4; + by = (ny + 3) / 4; + bz = (nz + 3) / 4; + bw = (nw + 3) / 4; + } + index.resize(blocks()); + } + + size_t nx, ny, nz, nw; // array dimensions + size_t bx, by, bz, bw; // array dimensions in number of blocks +}; + +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/traits.hpp b/include/zfp/internal/array/traits.hpp new file mode 100644 index 00000000..7ec4a02b --- /dev/null +++ b/include/zfp/internal/array/traits.hpp @@ -0,0 +1,30 @@ +#ifndef ZFP_TRAITS_HPP +#define ZFP_TRAITS_HPP + +namespace zfp { +namespace internal { + +// useful type traits +template +struct trait; +/* + static const zfp_type type; // corresponding zfp type + static const size_t precision; // precision in number of bits +*/ + +template <> +struct trait { + static const zfp_type type = zfp_type_float; + static const size_t precision = CHAR_BIT * sizeof(float); +}; + +template <> +struct trait { + static const zfp_type type = zfp_type_double; + static const size_t precision = CHAR_BIT * sizeof(double); +}; + +} +} + +#endif diff --git 
a/include/zfp/internal/array/view1.hpp b/include/zfp/internal/array/view1.hpp new file mode 100644 index 00000000..adfe868b --- /dev/null +++ b/include/zfp/internal/array/view1.hpp @@ -0,0 +1,303 @@ +#ifndef ZFP_VIEW1_HPP +#define ZFP_VIEW1_HPP + +// 1D array views + +namespace zfp { +namespace internal { +namespace dim1 { + +// abstract view of 1D array (base class) +template +class preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return nx; } + + // local to global array index + size_t global_x(size_t i) const { return x + i; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(container_type* array) : array(array), x(0), nx(array->size_x()) {} + explicit preview(container_type* array, size_t x, size_t nx) : array(array), x(x), nx(nx) {} + preview& operator=(container_type* a) + { + array = a; + x = 0; + nx = a->nx; + return *this; + } + + // global index bounds for iterators + size_t min_x() const { return x; } + size_t max_x() const { return x + nx; } + + container_type* array; // underlying container + size_t x; // offset into array + size_t nx; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 1D array +template +class const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim1::const_reference const_reference; + typedef typename zfp::internal::dim1::const_pointer const_pointer; + typedef typename zfp::internal::dim1::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + const_view(container_type* array) : preview(array) {} + const_view(container_type* array, size_t x, size_t nx) : preview(array, x, nx) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + + // [i] inspector + const_reference operator[](size_t index) const { return const_reference(this, x + index); } + + // (i) inspector + const_reference operator()(size_t i) const { return const_reference(this, x + i); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x); } + const_iterator cend() const { return const_iterator(this, x + nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_pointer; + friend class zfp::internal::dim1::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::array; + using preview::x; + using preview::nx; + + // inspector + value_type get(size_t x) const { return array->get(x); } +}; + +// generic read-write view into a rectangular subset of a 1D array +template +class view : public const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim1::const_reference const_reference; + typedef typename zfp::internal::dim1::const_pointer const_pointer; + typedef typename zfp::internal::dim1::const_iterator const_iterator; + typedef typename zfp::internal::dim1::reference reference; + typedef typename zfp::internal::dim1::pointer pointer; + typedef typename zfp::internal::dim1::iterator iterator; + + // construction--perform shallow copy of 
(sub)array + view(container_type* array) : const_view(array) {} + view(container_type* array, size_t x, size_t nx) : const_view(array, x, nx) {} + + // [i] inspector + const_reference operator[](size_t index) const { return const_reference(this, x + index); } + + // (i) inspector + const_reference operator()(size_t i) const { return const_reference(this, x + i); } + + // [i] mutator + reference operator[](size_t index) { return reference(this, x + index); } + + // (i) mutator + reference operator()(size_t i) { return reference(this, x + i); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x); } + const_iterator cend() const { return const_iterator(this, x + nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x); } + iterator end() { return iterator(this, x + nx); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_pointer; + friend class zfp::internal::dim1::const_iterator; + friend class zfp::internal::dim1::reference; + friend class zfp::internal::dim1::pointer; + friend class zfp::internal::dim1::iterator; + + using const_view::min_x; + using const_view::max_x; + using const_view::get; + using const_view::array; + using const_view::x; + using const_view::nx; + + // mutator + void set(size_t x, value_type val) { array->set(x, val); } + + // in-place updates + void add(size_t x, value_type val) { array->add(x, val); } + void sub(size_t x, value_type val) { array->sub(x, val); } + void mul(size_t x, value_type val) { array->mul(x, val); } + void div(size_t x, value_type val) { array->div(x, val); } +}; + +// thread-safe read-only view of 1D (sub)array with private cache +template +class private_const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename container_type::store_type store_type; + typedef typename zfp::internal::dim1::const_reference const_reference; + typedef typename zfp::internal::dim1::const_pointer const_pointer; + typedef typename zfp::internal::dim1::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + private_const_view(container_type* array, size_t cache_size = 0) : + preview(array), + cache(array->store, cache_size ? cache_size : array->cache.size()) + { + array->store.reference(); + } + private_const_view(container_type* array, size_t x, size_t nx, size_t cache_size = 0) : + preview(array, x, nx), + cache(array->store, cache_size ? 
cache_size : array->cache.size()) + { + array->store.reference(); + } + + // destructor + ~private_const_view() + { + array->store.unreference(); + } + + // dimensions of (sub)array + size_t size_x() const { return nx; } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) { cache.resize(bytes); } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i) inspector + const_reference operator()(size_t i) const { return const_reference(this, x + i); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x); } + const_iterator cend() const { return const_iterator(this, x + nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_pointer; + friend class zfp::internal::dim1::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::array; + using preview::x; + using preview::nx; + + // inspector + value_type get(size_t x) const { return cache.get(x); } + + BlockCache1 cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 1D (sub)array +template +class private_view : public private_const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim1::const_reference const_reference; + typedef typename zfp::internal::dim1::const_pointer const_pointer; + typedef typename zfp::internal::dim1::const_iterator const_iterator; + typedef typename zfp::internal::dim1::reference reference; + typedef typename zfp::internal::dim1::pointer pointer; + typedef typename zfp::internal::dim1::iterator iterator; + + // construction--perform shallow copy of (sub)array + private_view(container_type* array, size_t cache_size = 0) : private_const_view(array, cache_size) {} + private_view(container_type* array, size_t x, size_t nx, size_t cache_size = 0) : private_const_view(array, x, nx, cache_size) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(size_t index, size_t count) + { + partition(x, nx, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // (i) inspector + const_reference operator()(size_t i) const { return const_reference(this, x + i); } + + // (i) mutator + reference operator()(size_t i) { return reference(this, x + i); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x); } + const_iterator cend() const { return const_iterator(this, x + nx); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x); } + iterator end() { return iterator(this, x + nx); } + +protected: + friend class zfp::internal::dim1::const_handle; + friend class zfp::internal::dim1::const_pointer; + friend class zfp::internal::dim1::const_iterator; + friend class zfp::internal::dim1::reference; + friend class zfp::internal::dim1::pointer; + friend class zfp::internal::dim1::iterator; + + using private_const_view::min_x; + using private_const_view::max_x; + using private_const_view::get; + using private_const_view::array; + using 
private_const_view::x; + using private_const_view::nx; + using private_const_view::cache; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(size_t& offset, size_t& size, size_t index, size_t count) + { + size_t bmin = offset / 4; + size_t bmax = (offset + size + 3) / 4; + size_t xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + size_t xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(size_t x, value_type val) { cache.set(x, val); } + + // in-place updates + void add(size_t x, value_type val) { cache.ref(x) += val; } + void sub(size_t x, value_type val) { cache.ref(x) -= val; } + void mul(size_t x, value_type val) { cache.ref(x) *= val; } + void div(size_t x, value_type val) { cache.ref(x) /= val; } +}; + +} // dim1 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/view2.hpp b/include/zfp/internal/array/view2.hpp new file mode 100644 index 00000000..8e12336f --- /dev/null +++ b/include/zfp/internal/array/view2.hpp @@ -0,0 +1,498 @@ +#ifndef ZFP_VIEW2_HPP +#define ZFP_VIEW2_HPP + +// 2D array views + +namespace zfp { +namespace internal { +namespace dim2 { + +// abstract view of 2D array (base class) +template +class preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return nx * ny; } + + // local to global array indices + size_t global_x(size_t i) const { return x + i; } + size_t global_y(size_t j) const { return y + j; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(container_type* array) : array(array), x(0), y(0), nx(array->size_x()), ny(array->size_y()) {} + explicit preview(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : array(array), x(x), y(y), nx(nx), ny(ny) {} + preview& operator=(container_type* a) + { + array = a; + x = y = 0; + nx = a->nx; + ny = a->ny; + return *this; + } + + // global index bounds for iterators + size_t min_x() const { return x; } + size_t max_x() const { return x + nx; } + size_t min_y() const { return y; } + size_t max_y() const { return y + ny; } + + container_type* array; // underlying container + size_t x, y; // offset into array + size_t nx, ny; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 2D array +template +class const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + const_view(container_type* array) : preview(array) {} + const_view(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : preview(array, x, y, nx, ny) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // (i, j) inspector + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y); } + const_iterator cend() const { 
return const_iterator(this, x, y + ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; + + // inspector + value_type get(size_t x, size_t y) const { return array->get(x, y); } +}; + +// generic read-write view into a rectangular subset of a 2D array +template +class view : public const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::const_iterator const_iterator; + typedef typename zfp::internal::dim2::reference reference; + typedef typename zfp::internal::dim2::pointer pointer; + typedef typename zfp::internal::dim2::iterator iterator; + + // construction--perform shallow copy of (sub)array + view(container_type* array) : const_view(array) {} + view(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : const_view(array, x, y, nx, ny) {} + + // (i, j) inspector + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + + // (i, j) mutator + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y); } + const_iterator cend() const { return const_iterator(this, x, y + ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y); } + iterator end() { return iterator(this, x, y + ny); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::const_iterator; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + friend class zfp::internal::dim2::iterator; + + using const_view::min_x; + using const_view::max_x; + using const_view::min_y; + using const_view::max_y; + using const_view::get; + using const_view::array; + using const_view::x; + using const_view::y; + using const_view::nx; + using const_view::ny; + + // mutator + void set(size_t x, size_t y, value_type val) { array->set(x, y, val); } + + // in-place updates + void add(size_t x, size_t y, value_type val) { array->add(x, y, val); } + void sub(size_t x, size_t y, value_type val) { array->sub(x, y, val); } + void mul(size_t x, size_t y, value_type val) { array->mul(x, y, val); } + void div(size_t x, size_t y, value_type val) { array->div(x, y, val); } +}; + +// flat view of 2D array (operator[] returns scalar) +template +class flat_view : public view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::reference reference; + typedef typename zfp::internal::dim2::pointer pointer; + + // construction--perform shallow copy of (sub)array + 
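// (views are shallow: the constructors below store only a pointer to the
// underlying array plus the view's offset and dimensions; operator[] then
// maps a flat index to (i, j) as index = i + nx * j, see index() and ij() below)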
flat_view(container_type* array) : view(array) {} + flat_view(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : view(array, x, y, nx, ny) {} + + // convert (i, j) index to flat index + size_t index(size_t i, size_t j) const { return i + nx * j; } + + // convert flat index to (i, j) index + void ij(size_t& i, size_t& j, size_t index) const + { + i = index % nx; index /= nx; + j = index; + } + + // flat index [] inspector + const_reference operator[](size_t index) const + { + size_t i, j; + ij(i, j, index); + return const_reference(this, x + i, y + j); + } + + // flat index [] mutator + reference operator[](size_t index) + { + size_t i, j; + ij(i, j, index); + return reference(this, x + i, y + j); + } + + // (i, j) inspector + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + + // (i, j) mutator + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + + using view::array; + using view::x; + using view::y; + using view::nx; + using view::ny; + + // inspector + value_type get(size_t x, size_t y) const { return array->get(x, y); } + + // mutator + void set(size_t x, size_t y, value_type val) { array->set(x, y, val); } + + // in-place updates + void add(size_t x, size_t y, value_type val) { array->add(x, y, val); } + void sub(size_t x, size_t y, value_type val) { array->sub(x, y, val); } + void mul(size_t x, size_t y, value_type val) { array->mul(x, y, val); } + void div(size_t x, size_t y, value_type val) { array->div(x, y, val); } +}; + +// forward declaration of friends +template class nested_view1; +template class nested_view2; + +// nested view into a 1D rectangular subset of a 2D array +template +class nested_view1 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::reference reference; + typedef typename zfp::internal::dim2::pointer pointer; + + // dimensions of (sub)array + size_t size_x() const { return nx; } + + // [i] inspector and mutator + const_reference operator[](size_t index) const { return const_reference(this, x + index, y); } + reference operator[](size_t index) { return reference(this, x + index, y); } + + // (i) inspector and mutator + const_reference operator()(size_t i) const { return const_reference(this, x + i, y); } + reference operator()(size_t i) { return reference(this, x + i, y); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; + + // construction--perform shallow copy of (sub)array + friend class nested_view2; + explicit nested_view1(container_type* array) : preview(array) {} + explicit nested_view1(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : preview(array, x, y, nx, ny) {} + + // inspector + value_type get(size_t x, size_t y) const { return array->get(x, y); } + + // mutator + void set(size_t x, size_t y, value_type val) { 
array->set(x, y, val); } + + // in-place updates + void add(size_t x, size_t y, value_type val) { array->add(x, y, val); } + void sub(size_t x, size_t y, value_type val) { array->sub(x, y, val); } + void mul(size_t x, size_t y, value_type val) { array->mul(x, y, val); } + void div(size_t x, size_t y, value_type val) { array->div(x, y, val); } +}; + +// nested view into a 2D rectangular subset of a 2D array +template +class nested_view2 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::reference reference; + typedef typename zfp::internal::dim2::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view2(container_type* array) : preview(array) {} + nested_view2(container_type* array, size_t x, size_t y, size_t nx, size_t ny) : preview(array, x, y, nx, ny) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // 1D view + nested_view1 operator[](size_t index) const { return nested_view1(array, x, y + index, nx, 1); } + + // (i, j) inspector and mutator + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; + + // inspector + value_type get(size_t x, size_t y) const { return array->get(x, y); } + + // mutator + void set(size_t x, size_t y, value_type val) { array->set(x, y, val); } + + // in-place updates + void add(size_t x, size_t y, value_type val) { array->add(x, y, val); } + void sub(size_t x, size_t y, value_type val) { array->sub(x, y, val); } + void mul(size_t x, size_t y, value_type val) { array->mul(x, y, val); } + void div(size_t x, size_t y, value_type val) { array->div(x, y, val); } +}; + +// thread-safe read-only view of 2D (sub)array with private cache +template +class private_const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename container_type::store_type store_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + private_const_view(container_type* array, size_t cache_size = 0) : + preview(array), + cache(array->store, cache_size ? cache_size : array->cache.size()) + { + array->store.reference(); + } + private_const_view(container_type* array, size_t x, size_t y, size_t nx, size_t ny, size_t cache_size = 0) : + preview(array, x, y, nx, ny), + cache(array->store, cache_size ? 
cache_size : array->cache.size()) + { + array->store.reference(); + } + + // destructor + ~private_const_view() + { + array->store.unreference(); + } + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) { cache.resize(bytes); } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i, j) inspector + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y); } + const_iterator cend() const { return const_iterator(this, x, y + ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class zfp::internal::dim2::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::array; + using preview::x; + using preview::y; + using preview::nx; + using preview::ny; + + // inspector + value_type get(size_t x, size_t y) const { return cache.get(x, y); } + + BlockCache2 cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 2D (sub)array +template +class private_view : public private_const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim2::const_reference const_reference; + typedef typename zfp::internal::dim2::const_pointer const_pointer; + typedef typename zfp::internal::dim2::const_iterator const_iterator; + typedef typename zfp::internal::dim2::reference reference; + typedef typename zfp::internal::dim2::pointer pointer; + typedef typename zfp::internal::dim2::iterator iterator; + + // construction--perform shallow copy of (sub)array + private_view(container_type* array, size_t cache_size = 0) : private_const_view(array, cache_size) {} + private_view(container_type* array, size_t x, size_t y, size_t nx, size_t ny, size_t cache_size = 0) : private_const_view(array, x, y, nx, ny, cache_size) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(size_t index, size_t count) + { + if (nx > ny) + partition(x, nx, index, count); + else + partition(y, ny, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // (i, j) inspector + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j); } + + // (i, j) mutator + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y); } + const_iterator cend() const { return const_iterator(this, x, y + ny); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y); } + iterator end() { return iterator(this, x, y + ny); } + +protected: + friend class zfp::internal::dim2::const_handle; + friend class zfp::internal::dim2::const_pointer; + friend class 
zfp::internal::dim2::const_iterator; + friend class zfp::internal::dim2::reference; + friend class zfp::internal::dim2::pointer; + friend class zfp::internal::dim2::iterator; + + using private_const_view::min_x; + using private_const_view::max_x; + using private_const_view::min_y; + using private_const_view::max_y; + using private_const_view::get; + using private_const_view::array; + using private_const_view::x; + using private_const_view::y; + using private_const_view::nx; + using private_const_view::ny; + using private_const_view::cache; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(size_t& offset, size_t& size, size_t index, size_t count) + { + size_t bmin = offset / 4; + size_t bmax = (offset + size + 3) / 4; + size_t xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + size_t xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(size_t x, size_t y, value_type val) { cache.set(x, y, val); } + + // in-place updates + void add(size_t x, size_t y, value_type val) { cache.ref(x, y) += val; } + void sub(size_t x, size_t y, value_type val) { cache.ref(x, y) -= val; } + void mul(size_t x, size_t y, value_type val) { cache.ref(x, y) *= val; } + void div(size_t x, size_t y, value_type val) { cache.ref(x, y) /= val; } +}; + +} // dim2 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/view3.hpp b/include/zfp/internal/array/view3.hpp new file mode 100644 index 00000000..24ceb8f4 --- /dev/null +++ b/include/zfp/internal/array/view3.hpp @@ -0,0 +1,584 @@ +#ifndef ZFP_VIEW3_HPP +#define ZFP_VIEW3_HPP + +// 3D array views + +namespace zfp { +namespace internal { +namespace dim3 { + +// abstract view of 3D array (base class) +template +class preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return nx * ny * nz; } + + // local to global array indices + size_t global_x(size_t i) const { return x + i; } + size_t global_y(size_t j) const { return y + j; } + size_t global_z(size_t k) const { return z + k; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(container_type* array) : array(array), x(0), y(0), z(0), nx(array->size_x()), ny(array->size_y()), nz(array->size_z()) {} + explicit preview(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : array(array), x(x), y(y), z(z), nx(nx), ny(ny), nz(nz) {} + preview& operator=(container_type* a) + { + array = a; + x = y = z = 0; + nx = a->nx; + ny = a->ny; + nz = a->nz; + return *this; + } + + // global index bounds for iterators + size_t min_x() const { return x; } + size_t max_x() const { return x + nx; } + size_t min_y() const { return y; } + size_t max_y() const { return y + ny; } + size_t min_z() const { return z; } + size_t max_z() const { return z + nz; } + + container_type* array; // underlying container + size_t x, y, z; // offset into array + size_t nx, ny, nz; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 3D array +template +class const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference 
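// --- Illustrative usage sketch (not part of this patch) ---
// The private_view class above is the thread-safe way to update a compressed 2D
// array in parallel: each thread owns a shallow view with its own block cache,
// partitions the index space into block-aligned pieces, and flushes modified
// blocks back under a critical section. Assumes OpenMP and the public
// zfp::array2d class; the header path zfp/array2.hpp reflects this release's
// include layout.
#include <cstddef>
#include <omp.h>
#include "zfp/array2.hpp"

void scale_all(zfp::array2d& a, double s)
{
  #pragma omp parallel
  {
    // thread-private view with its own cache of decompressed blocks
    zfp::array2d::private_view v(&a);
    v.partition(omp_get_thread_num(), omp_get_num_threads());
    for (size_t j = 0; j < v.size_y(); j++)
      for (size_t i = 0; i < v.size_x(); i++)
        v(i, j) *= s;                // updates stay in the private cache
    #pragma omp critical
    v.flush_cache();                 // compress modified blocks back to the shared store
  }
}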
const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + const_view(container_type* array) : preview(array) {} + const_view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : preview(array, x, y, z, nx, ny, nz) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z); } + const_iterator cend() const { return const_iterator(this, x, y, z + nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::min_z; + using preview::max_z; + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + + // inspector + value_type get(size_t x, size_t y, size_t z) const { return array->get(x, y, z); } +}; + +// generic read-write view into a rectangular subset of a 3D array +template +class view : public const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::const_iterator const_iterator; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + typedef typename zfp::internal::dim3::iterator iterator; + + // construction--perform shallow copy of (sub)array + view(container_type* array) : const_view(array) {} + view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : const_view(array, x, y, z, nx, ny, nz) {} + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + + // (i, j, k) mutator + reference operator()(size_t i, size_t j, size_t k) { return reference(this, x + i, y + j, z + k); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z); } + const_iterator cend() const { return const_iterator(this, x, y, z + nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y, z); } + iterator end() { return iterator(this, x, y, z + nz); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + friend class zfp::internal::dim3::iterator; + + using const_view::min_x; + using const_view::max_x; + using const_view::min_y; + using const_view::max_y; + using const_view::min_z; + using const_view::max_z; + using 
const_view::get; + using const_view::array; + using const_view::x; + using const_view::y; + using const_view::z; + using const_view::nx; + using const_view::ny; + using const_view::nz; + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { array->set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { array->add(x, y, z, val); } + void sub(size_t x, size_t y, size_t z, value_type val) { array->sub(x, y, z, val); } + void mul(size_t x, size_t y, size_t z, value_type val) { array->mul(x, y, z, val); } + void div(size_t x, size_t y, size_t z, value_type val) { array->div(x, y, z, val); } +}; + +// flat view of 3D array (operator[] returns scalar) +template +class flat_view : public view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + + // construction--perform shallow copy of (sub)array + flat_view(container_type* array) : view(array) {} + flat_view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : view(array, x, y, z, nx, ny, nz) {} + + // convert (i, j, k) index to flat index + size_t index(size_t i, size_t j, size_t k) const { return i + nx * (j + ny * k); } + + // convert flat index to (i, j, k) index + void ijk(size_t& i, size_t& j, size_t& k, size_t index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index; + } + + // flat index [] inspector + const_reference operator[](size_t index) const + { + size_t i, j, k; + ijk(i, j, k, index); + return const_reference(this, x + i, y + j, z + k); + } + + // flat index [] mutator + reference operator[](size_t index) + { + size_t i, j, k; + ijk(i, j, k, index); + return reference(this, x + i, y + j, z + k); + } + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + + // (i, j, k) mutator + reference operator()(size_t i, size_t j, size_t k) { return reference(this, x + i, y + j, z + k); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + + using view::array; + using view::x; + using view::y; + using view::z; + using view::nx; + using view::ny; + using view::nz; + + // inspector + value_type get(size_t x, size_t y, size_t z) const { return array->get(x, y, z); } + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { array->set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { array->add(x, y, z, val); } + void sub(size_t x, size_t y, size_t z, value_type val) { array->sub(x, y, z, val); } + void mul(size_t x, size_t y, size_t z, value_type val) { array->mul(x, y, z, val); } + void div(size_t x, size_t y, size_t z, value_type val) { array->div(x, y, z, val); } +}; + +// forward declaration of friends +template class nested_view1; +template class nested_view2; +template class nested_view3; + +// nested view into a 1D rectangular subset of a 3D array +template +class nested_view1 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type 
value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + + // dimensions of (sub)array + size_t size_x() const { return nx; } + + // [i] inspector and mutator + const_reference operator[](size_t index) const { return const_reference(this, x + index, y, z); } + reference operator[](size_t index) { return reference(this, x + index, y, z); } + + // (i) inspector and mutator + const_reference operator()(size_t i) const { return const_reference(this, x + i, y, z); } + reference operator()(size_t i) { return reference(this, x + i, y, z); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + + // construction--perform shallow copy of (sub)array + friend class nested_view2; + explicit nested_view1(container_type* array) : preview(array) {} + explicit nested_view1(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : preview(array, x, y, z, nx, ny, nz) {} + + // inspector + value_type get(size_t x, size_t y, size_t z) const { return array->get(x, y, z); } + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { array->set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { array->add(x, y, z, val); } + void sub(size_t x, size_t y, size_t z, value_type val) { array->sub(x, y, z, val); } + void mul(size_t x, size_t y, size_t z, value_type val) { array->mul(x, y, z, val); } + void div(size_t x, size_t y, size_t z, value_type val) { array->div(x, y, z, val); } +}; + +// nested view into a 2D rectangular subset of a 3D array +template +class nested_view2 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view2(container_type* array) : preview(array) {} + nested_view2(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : preview(array, x, y, z, nx, ny, nz) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // 1D view + nested_view1 operator[](size_t index) const { return nested_view1(array, x, y + index, z, nx, 1, 1); } + + // (i, j) inspector and mutator + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j, z); } + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j, z); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + + // 
inspector + value_type get(size_t x, size_t y, size_t z) const { return array->get(x, y, z); } + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { array->set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { array->add(x, y, z, val); } + void sub(size_t x, size_t y, size_t z, value_type val) { array->sub(x, y, z, val); } + void mul(size_t x, size_t y, size_t z, value_type val) { array->mul(x, y, z, val); } + void div(size_t x, size_t y, size_t z, value_type val) { array->div(x, y, z, val); } +}; + +// nested view into a 3D rectangular subset of a 3D array +template +class nested_view3 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view3(container_type* array) : preview(array) {} + nested_view3(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz) : preview(array, x, y, z, nx, ny, nz) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // 2D view + nested_view2 operator[](size_t index) const { return nested_view2(array, x, y, z + index, nx, ny, 1); } + + // (i, j, k) inspector and mutator + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + reference operator()(size_t i, size_t j, size_t k) { return reference(this, x + i, y + j, z + k); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + + // inspector + value_type get(size_t x, size_t y, size_t z) const { return array->get(x, y, z); } + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { array->set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { array->add(x, y, z, val); } + void sub(size_t x, size_t y, size_t z, value_type val) { array->sub(x, y, z, val); } + void mul(size_t x, size_t y, size_t z, value_type val) { array->mul(x, y, z, val); } + void div(size_t x, size_t y, size_t z, value_type val) { array->div(x, y, z, val); } +}; + +// thread-safe read-only view of 3D (sub)array with private cache +template +class private_const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename container_type::store_type store_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + private_const_view(container_type* array, size_t cache_size = 0) : + preview(array), + cache(array->store, cache_size ? 
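// --- Illustrative usage sketch (not part of this patch) ---
// nested_view3/2/1 above implement C-style bracket indexing: each operator[]
// peels off one dimension, so v[k][j][i] addresses the same element as a(i, j, k).
// Assumes the public zfp::array3d class and its nested_view typedef
// (zfp/array3.hpp in this release).
#include <cstddef>
#include "zfp/array3.hpp"

double first_element(zfp::array3d& a)
{
  zfp::array3d::nested_view v(&a); // view spanning the whole array
  return v[0][0][0];               // nested_view3 -> nested_view2 -> nested_view1 -> value
}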
cache_size : array->cache.size()) + { + array->store.reference(); + } + private_const_view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz, size_t cache_size = 0) : + preview(array, x, y, z, nx, ny, nz), + cache(array->store, cache_size ? cache_size : array->cache.size()) + { + array->store.reference(); + } + + // destructor + ~private_const_view() + { + array->store.unreference(); + } + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) { cache.resize(bytes); } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z); } + const_iterator cend() const { return const_iterator(this, x, y, z + nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::min_z; + using preview::max_z; + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::nx; + using preview::ny; + using preview::nz; + + // inspector + value_type get(size_t x, size_t y, size_t z) const { return cache.get(x, y, z); } + + BlockCache3 cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 3D (sub)array +template +class private_view : public private_const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim3::const_reference const_reference; + typedef typename zfp::internal::dim3::const_pointer const_pointer; + typedef typename zfp::internal::dim3::const_iterator const_iterator; + typedef typename zfp::internal::dim3::reference reference; + typedef typename zfp::internal::dim3::pointer pointer; + typedef typename zfp::internal::dim3::iterator iterator; + + // construction--perform shallow copy of (sub)array + private_view(container_type* array, size_t cache_size = 0) : private_const_view(array, cache_size) {} + private_view(container_type* array, size_t x, size_t y, size_t z, size_t nx, size_t ny, size_t nz, size_t cache_size = 0) : private_const_view(array, x, y, z, nx, ny, nz, cache_size) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(size_t index, size_t count) + { + if (nx > std::max(ny, nz)) + partition(x, nx, index, count); + else if (ny > std::max(nx, nz)) + partition(y, ny, index, count); + else + partition(z, nz, index, count); + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k); } + + // (i, j, k) mutator + reference operator()(size_t i, size_t j, size_t k) { 
return reference(this, x + i, y + j, z + k); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z); } + const_iterator cend() const { return const_iterator(this, x, y, z + nz); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y, z); } + iterator end() { return iterator(this, x, y, z + nz); } + +protected: + friend class zfp::internal::dim3::const_handle; + friend class zfp::internal::dim3::const_pointer; + friend class zfp::internal::dim3::const_iterator; + friend class zfp::internal::dim3::reference; + friend class zfp::internal::dim3::pointer; + friend class zfp::internal::dim3::iterator; + + using private_const_view::min_x; + using private_const_view::max_x; + using private_const_view::min_y; + using private_const_view::max_y; + using private_const_view::min_z; + using private_const_view::max_z; + using private_const_view::get; + using private_const_view::array; + using private_const_view::x; + using private_const_view::y; + using private_const_view::z; + using private_const_view::nx; + using private_const_view::ny; + using private_const_view::nz; + using private_const_view::cache; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(size_t& offset, size_t& size, size_t index, size_t count) + { + size_t bmin = offset / 4; + size_t bmax = (offset + size + 3) / 4; + size_t xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + size_t xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(size_t x, size_t y, size_t z, value_type val) { cache.set(x, y, z, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, value_type val) { cache.ref(x, y, z) += val; } + void sub(size_t x, size_t y, size_t z, value_type val) { cache.ref(x, y, z) -= val; } + void mul(size_t x, size_t y, size_t z, value_type val) { cache.ref(x, y, z) *= val; } + void div(size_t x, size_t y, size_t z, value_type val) { cache.ref(x, y, z) /= val; } +}; + +} // dim3 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/array/view4.hpp b/include/zfp/internal/array/view4.hpp new file mode 100644 index 00000000..5888a305 --- /dev/null +++ b/include/zfp/internal/array/view4.hpp @@ -0,0 +1,679 @@ +#ifndef ZFP_VIEW4_HPP +#define ZFP_VIEW4_HPP + +// 4D array views + +namespace zfp { +namespace internal { +namespace dim4 { + +// abstract view of 4D array (base class) +template +class preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + + // rate in bits per value + double rate() const { return array->rate(); } + + // dimensions of (sub)array + size_t size() const { return nx * ny * nz * nw; } + + // local to global array indices + size_t global_x(size_t i) const { return x + i; } + size_t global_y(size_t j) const { return y + j; } + size_t global_z(size_t k) const { return z + k; } + size_t global_w(size_t l) const { return w + l; } + +protected: + // construction and assignment--perform shallow copy of (sub)array + explicit preview(container_type* array) : array(array), x(0), y(0), z(0), w(0), nx(array->size_x()), ny(array->size_y()), nz(array->size_z()), nw(array->size_w()) {} + explicit preview(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : array(array), x(x), 
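// --- Illustrative sketch (not part of this patch) ---
// The static partition() helper above splits an index range into count pieces
// along zfp's 4-value block boundaries: the range is first expressed in blocks,
// the blocks are divided evenly, and the result is clamped back to the original
// extent. The same rule appears in the 1D through 4D private views; this
// standalone copy just makes the arithmetic easy to test in isolation.
#include <algorithm>
#include <cstddef>

void partition_range(std::size_t& offset, std::size_t& size, std::size_t index, std::size_t count)
{
  std::size_t bmin = offset / 4;                // first (possibly partial) block
  std::size_t bmax = (offset + size + 3) / 4;   // one past the last block
  std::size_t lo = std::max(offset,        4 * (bmin + (bmax - bmin) * (index + 0) / count));
  std::size_t hi = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count));
  offset = lo;
  size = hi - lo;
}
// Example: a 10-value range starting at 0 split into count = 2 pieces yields
// [0, 4) and [4, 10); the cut lands on a block boundary (value 4).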
y(y), z(z), w(w), nx(nx), ny(ny), nz(nz), nw(nw) {} + preview& operator=(container_type* a) + { + array = a; + x = y = z = w = 0; + nx = a->nx; + ny = a->ny; + nz = a->nz; + nw = a->nw; + return *this; + } + + // global index bounds for iterators + size_t min_x() const { return x; } + size_t max_x() const { return x + nx; } + size_t min_y() const { return y; } + size_t max_y() const { return y + ny; } + size_t min_z() const { return z; } + size_t max_z() const { return z + nz; } + size_t min_w() const { return w; } + size_t max_w() const { return w + nw; } + + container_type* array; // underlying container + size_t x, y, z, w; // offset into array + size_t nx, ny, nz, nw; // dimensions of subarray +}; + +// generic read-only view into a rectangular subset of a 4D array +template +class const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + const_view(container_type* array) : preview(array) {} + const_view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : preview(array, x, y, z, w, nx, ny, nz, nw) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // (i, j, k, l) inspector + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z, w); } + const_iterator cend() const { return const_iterator(this, x, y, z, w + nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::min_z; + using preview::max_z; + using preview::min_w; + using preview::max_w; + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } +}; + +// generic read-write view into a rectangular subset of a 4D array +template +class view : public const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::const_iterator const_iterator; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + typedef typename zfp::internal::dim4::iterator iterator; + + // construction--perform shallow copy of (sub)array + view(container_type* array) : const_view(array) {} + view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, 
size_t nw) : const_view(array, x, y, z, w, nx, ny, nz, nw) {} + + // (i, j, k, l) inspector + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + + // (i, j, k, l) mutator + reference operator()(size_t i, size_t j, size_t k, size_t l) { return reference(this, x + i, y + j, z + k, w + l); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z, w); } + const_iterator cend() const { return const_iterator(this, x, y, z, w + nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y, z, w); } + iterator end() { return iterator(this, x, y, z, w + nw); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + friend class zfp::internal::dim4::iterator; + + using const_view::min_x; + using const_view::max_x; + using const_view::min_y; + using const_view::max_y; + using const_view::min_z; + using const_view::max_z; + using const_view::min_w; + using const_view::max_w; + using const_view::get; + using const_view::array; + using const_view::x; + using const_view::y; + using const_view::z; + using const_view::w; + using const_view::nx; + using const_view::ny; + using const_view::nz; + using const_view::nw; + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { array->add(x, y, z, w, val); } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// flat view of 4D array (operator[] returns scalar) +template +class flat_view : public view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + + // construction--perform shallow copy of (sub)array + flat_view(container_type* array) : view(array) {} + flat_view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : view(array, x, y, z, w, nx, ny, nz, nw) {} + + // convert (i, j, k, l) index to flat index + size_t index(size_t i, size_t j, size_t k, size_t l) const { return i + nx * (j + ny * (k + nz * l)); } + + // convert flat index to (i, j, k, l) index + void ijkl(size_t& i, size_t& j, size_t& k, size_t& l, size_t index) const + { + i = index % nx; index /= nx; + j = index % ny; index /= ny; + k = index % nz; index /= nz; + l = index; + } + + // flat index [] inspector + const_reference operator[](size_t index) const + { + size_t i, j, k, l; + ijkl(i, j, k, l, index); + return const_reference(this, x + i, y + j, z + k, w + l); + } + + // flat index [] mutator + reference operator[](size_t index) + { + size_t i, j, k, l; + ijkl(i, j, k, l, index); + return reference(this, 
x + i, y + j, z + k, w + l); + } + + // (i, j, k, l) inspector + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + + // (i, j, k, l) mutator + reference operator()(size_t i, size_t j, size_t k, size_t l) { return reference(this, x + i, y + j, z + k, w + l); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + + using view::array; + using view::x; + using view::y; + using view::z; + using view::w; + using view::nx; + using view::ny; + using view::nz; + using view::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { array->add(x, y, z, w, val); } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// forward declaration of friends +template class nested_view1; +template class nested_view2; +template class nested_view3; +template class nested_view4; + +// nested view into a 1D rectangular subset of a 4D array +template +class nested_view1 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + + // dimensions of (sub)array + size_t size_x() const { return nx; } + + // [i] inspector and mutator + const_reference operator[](size_t index) const { return const_reference(this, x + index, y, z, w); } + reference operator[](size_t index) { return reference(this, x + index, y, z, w); } + + // (i) inspector and mutator + const_reference operator()(size_t i) const { return const_reference(this, x + i, y, z, w); } + reference operator()(size_t i) { return reference(this, x + i, y, z, w); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // construction--perform shallow copy of (sub)array + friend class nested_view2; + explicit nested_view1(container_type* array) : preview(array) {} + explicit nested_view1(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : preview(array, x, y, z, w, nx, ny, nz, nw) {} + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { 
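// --- Illustrative sketch (not part of this patch) ---
// flat_view::index() and ijkl() above are inverse mappings between (i, j, k, l)
// coordinates and a single flat index with x varying fastest. A quick round-trip
// check of that arithmetic, using made-up extents:
#include <cassert>
#include <cstddef>

int main()
{
  const std::size_t nx = 5, ny = 6, nz = 7;   // nw is not needed by the mapping
  const std::size_t i = 2, j = 3, k = 4, l = 1;
  std::size_t idx = i + nx * (j + ny * (k + nz * l)); // index()
  std::size_t ii = idx % nx; idx /= nx;               // ijkl()
  std::size_t jj = idx % ny; idx /= ny;
  std::size_t kk = idx % nz; idx /= nz;
  std::size_t ll = idx;
  assert(ii == i && jj == j && kk == k && ll == l);
  return 0;
}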
array->add(x, y, z, w, val); } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// nested view into a 2D rectangular subset of a 4D array +template +class nested_view2 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view2(container_type* array) : preview(array) {} + nested_view2(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : preview(array, x, y, z, w, nx, ny, nz, nw) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + + // 1D view + nested_view1 operator[](size_t index) const { return nested_view1(array, x, y + index, z, w, nx, 1, 1, 1); } + + // (i, j) inspector and mutator + const_reference operator()(size_t i, size_t j) const { return const_reference(this, x + i, y + j, z, w); } + reference operator()(size_t i, size_t j) { return reference(this, x + i, y + j, z, w); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { array->add(x, y, z, w, val); } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// nested view into a 3D rectangular subset of a 4D array +template +class nested_view3 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view3(container_type* array) : preview(array) {} + nested_view3(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : preview(array, x, y, z, w, nx, ny, nz, nw) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + + // 2D view + nested_view2 
operator[](size_t index) const { return nested_view2(array, x, y, z + index, w, nx, ny, 1, 1); } + + // (i, j, k) inspector and mutator + const_reference operator()(size_t i, size_t j, size_t k) const { return const_reference(this, x + i, y + j, z + k, w); } + reference operator()(size_t i, size_t j, size_t k) { return reference(this, x + i, y + j, z + k, w); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { array->add(x, y, z, w, val); } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// nested view into a 4D rectangular subset of a 4D array +template +class nested_view4 : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + + // construction--perform shallow copy of (sub)array + nested_view4(container_type* array) : preview(array) {} + nested_view4(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw) : preview(array, x, y, z, w, nx, ny, nz, nw) {} + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // 3D view + nested_view3 operator[](size_t index) const { return nested_view3(array, x, y, z, w + index, nx, ny, nz, 1); } + + // (i, j, k, l) inspector and mutator + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + reference operator()(size_t i, size_t j, size_t k, size_t l) { return reference(this, x + i, y + j, z + k, w + l); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return array->get(x, y, z, w); } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { array->set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { array->add(x, y, z, w, val); } + void 
sub(size_t x, size_t y, size_t z, size_t w, value_type val) { array->sub(x, y, z, w, val); } + void mul(size_t x, size_t y, size_t z, size_t w, value_type val) { array->mul(x, y, z, w, val); } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { array->div(x, y, z, w, val); } +}; + +// thread-safe read-only view of 4D (sub)array with private cache +template +class private_const_view : public preview { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename container_type::store_type store_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::const_iterator const_iterator; + + // construction--perform shallow copy of (sub)array + private_const_view(container_type* array, size_t cache_size = 0) : + preview(array), + cache(array->store, cache_size ? cache_size : array->cache.size()) + { + array->store.reference(); + } + private_const_view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw, size_t cache_size = 0) : + preview(array, x, y, z, w, nx, ny, nz, nw), + cache(array->store, cache_size ? cache_size : array->cache.size()) + { + array->store.reference(); + } + + // destructor + ~private_const_view() + { + array->store.unreference(); + } + + // dimensions of (sub)array + size_t size_x() const { return nx; } + size_t size_y() const { return ny; } + size_t size_z() const { return nz; } + size_t size_w() const { return nw; } + + // cache size in number of bytes + size_t cache_size() const { return cache.size(); } + + // set minimum cache size in bytes (array dimensions must be known) + void set_cache_size(size_t bytes) { cache.resize(bytes); } + + // empty cache without compressing modified cached blocks + void clear_cache() const { cache.clear(); } + + // (i, j, k) inspector + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z, w); } + const_iterator cend() const { return const_iterator(this, x, y, z, w + nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + + using preview::min_x; + using preview::max_x; + using preview::min_y; + using preview::max_y; + using preview::min_z; + using preview::max_z; + using preview::min_w; + using preview::max_w; + using preview::array; + using preview::x; + using preview::y; + using preview::z; + using preview::w; + using preview::nx; + using preview::ny; + using preview::nz; + using preview::nw; + + // inspector + value_type get(size_t x, size_t y, size_t z, size_t w) const { return cache.get(x, y, z, w); } + + BlockCache4 cache; // cache of decompressed blocks +}; + +// thread-safe read-write view of private 4D (sub)array +template +class private_view : public private_const_view { +public: + typedef Container container_type; + typedef typename container_type::value_type value_type; + typedef typename zfp::internal::dim4::const_reference const_reference; + typedef typename zfp::internal::dim4::const_pointer const_pointer; + typedef typename zfp::internal::dim4::const_iterator const_iterator; 
+ typedef typename zfp::internal::dim4::reference reference; + typedef typename zfp::internal::dim4::pointer pointer; + typedef typename zfp::internal::dim4::iterator iterator; + + // construction--perform shallow copy of (sub)array + private_view(container_type* array, size_t cache_size = 0) : private_const_view(array, cache_size) {} + private_view(container_type* array, size_t x, size_t y, size_t z, size_t w, size_t nx, size_t ny, size_t nz, size_t nw, size_t cache_size = 0) : private_const_view(array, x, y, z, w, nx, ny, nz, nw, cache_size) {} + + // partition view into count block-aligned pieces, with 0 <= index < count + void partition(size_t index, size_t count) + { + if (std::max(nx, ny) > std::max(nz, nw)) { + if (nx > ny) + partition(x, nx, index, count); + else + partition(y, ny, index, count); + } + else { + if (nz > nw) + partition(z, nz, index, count); + else + partition(w, nw, index, count); + } + } + + // flush cache by compressing all modified cached blocks + void flush_cache() const { cache.flush(); } + + // (i, j, k, l) inspector + const_reference operator()(size_t i, size_t j, size_t k, size_t l) const { return const_reference(this, x + i, y + j, z + k, w + l); } + + // (i, j, k, l) mutator + reference operator()(size_t i, size_t j, size_t k, size_t l) { return reference(this, x + i, y + j, z + k, w + l); } + + // random access iterators + const_iterator cbegin() const { return const_iterator(this, x, y, z, w); } + const_iterator cend() const { return const_iterator(this, x, y, z, w + nw); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend(); } + iterator begin() { return iterator(this, x, y, z, w); } + iterator end() { return iterator(this, x, y, z, w + nw); } + +protected: + friend class zfp::internal::dim4::const_handle; + friend class zfp::internal::dim4::const_pointer; + friend class zfp::internal::dim4::const_iterator; + friend class zfp::internal::dim4::reference; + friend class zfp::internal::dim4::pointer; + friend class zfp::internal::dim4::iterator; + + using private_const_view::min_x; + using private_const_view::max_x; + using private_const_view::min_y; + using private_const_view::max_y; + using private_const_view::min_z; + using private_const_view::max_z; + using private_const_view::min_w; + using private_const_view::max_w; + using private_const_view::get; + using private_const_view::array; + using private_const_view::x; + using private_const_view::y; + using private_const_view::z; + using private_const_view::w; + using private_const_view::nx; + using private_const_view::ny; + using private_const_view::nz; + using private_const_view::nw; + using private_const_view::cache; + + // block-aligned partition of [offset, offset + size): index out of count + static void partition(size_t& offset, size_t& size, size_t index, size_t count) + { + size_t bmin = offset / 4; + size_t bmax = (offset + size + 3) / 4; + size_t xmin = std::max(offset + 0, 4 * (bmin + (bmax - bmin) * (index + 0) / count)); + size_t xmax = std::min(offset + size, 4 * (bmin + (bmax - bmin) * (index + 1) / count)); + offset = xmin; + size = xmax - xmin; + } + + // mutator + void set(size_t x, size_t y, size_t z, size_t w, value_type val) { cache.set(x, y, z, w, val); } + + // in-place updates + void add(size_t x, size_t y, size_t z, size_t w, value_type val) { cache.ref(x, y, z, w) += val; } + void sub(size_t x, size_t y, size_t z, size_t w, value_type val) { cache.ref(x, y, z, w) -= val; } + void mul(size_t x, size_t y, size_t z, size_t w, value_type 
val) { cache.ref(x, y, z, w) *= val; } + void div(size_t x, size_t y, size_t z, size_t w, value_type val) { cache.ref(x, y, z, w) /= val; } +}; + +} // dim4 +} // internal +} // zfp + +#endif diff --git a/include/zfp/internal/cfp/array1d.h b/include/zfp/internal/cfp/array1d.h new file mode 100644 index 00000000..65bddff3 --- /dev/null +++ b/include/zfp/internal/cfp/array1d.h @@ -0,0 +1,141 @@ +#ifndef CFP_ARRAY_1D_H +#define CFP_ARRAY_1D_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array1d; + +typedef struct { + cfp_array1d array; + size_t x; +} cfp_ref1d; + +typedef struct { + cfp_ref1d reference; +} cfp_ptr1d; + +typedef struct { + cfp_array1d array; + size_t x; +} cfp_iter1d; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ref1d self); + void (*set)(cfp_ref1d self, double val); + cfp_ptr1d (*ptr)(cfp_ref1d self); + void (*copy)(cfp_ref1d self, const cfp_ref1d src); +} cfp_ref1d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ptr1d self); + double (*get_at)(const cfp_ptr1d self, ptrdiff_t d); + void (*set)(cfp_ptr1d self, double val); + void (*set_at)(cfp_ptr1d self, ptrdiff_t d, double val); + cfp_ref1d (*ref)(cfp_ptr1d self); + cfp_ref1d (*ref_at)(cfp_ptr1d self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + zfp_bool (*gt)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + zfp_bool (*leq)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + zfp_bool (*geq)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + zfp_bool (*eq)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + zfp_bool (*neq)(const cfp_ptr1d lhs, const cfp_ptr1d rhs); + ptrdiff_t (*distance)(const cfp_ptr1d first, const cfp_ptr1d last); + cfp_ptr1d (*next)(const cfp_ptr1d p, ptrdiff_t d); + cfp_ptr1d (*prev)(const cfp_ptr1d p, ptrdiff_t d); + cfp_ptr1d (*inc)(const cfp_ptr1d p); + cfp_ptr1d (*dec)(const cfp_ptr1d p); +} cfp_ptr1d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_iter1d self); + double (*get_at)(const cfp_iter1d self, ptrdiff_t d); + void (*set)(cfp_iter1d self, double val); + void (*set_at)(cfp_iter1d self, ptrdiff_t d, double val); + cfp_ref1d (*ref)(cfp_iter1d self); + cfp_ref1d (*ref_at)(cfp_iter1d self, ptrdiff_t d); + cfp_ptr1d (*ptr)(cfp_iter1d self); + cfp_ptr1d (*ptr_at)(cfp_iter1d self, ptrdiff_t d); + size_t (*i)(const cfp_iter1d self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter1d lhs, const cfp_iter1d rhs); + zfp_bool (*gt)(const cfp_iter1d lhs, const cfp_iter1d rhs); + zfp_bool (*leq)(const cfp_iter1d lhs, const cfp_iter1d rhs); + zfp_bool (*geq)(const cfp_iter1d lhs, const cfp_iter1d rhs); + zfp_bool (*eq)(const cfp_iter1d lhs, const cfp_iter1d rhs); + zfp_bool (*neq)(const cfp_iter1d lhs, const cfp_iter1d rhs); + ptrdiff_t (*distance)(const cfp_iter1d first, const cfp_iter1d last); + cfp_iter1d (*next)(const cfp_iter1d it, ptrdiff_t d); + cfp_iter1d (*prev)(const cfp_iter1d it, ptrdiff_t d); + cfp_iter1d (*inc)(const cfp_iter1d it); + cfp_iter1d (*dec)(const cfp_iter1d it); +} cfp_iter1d_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array1d a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t 
(*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header1d_api; + +typedef struct { + cfp_array1d (*ctor_default)(); + cfp_array1d (*ctor)(size_t n, double rate, const double* p, size_t cache_size); + cfp_array1d (*ctor_copy)(const cfp_array1d src); + cfp_array1d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array1d self); + + void (*deep_copy)(cfp_array1d self, const cfp_array1d src); + + double (*rate)(const cfp_array1d self); + double (*set_rate)(cfp_array1d self, double rate); + size_t (*cache_size)(const cfp_array1d self); + void (*set_cache_size)(cfp_array1d self, size_t bytes); + void (*clear_cache)(const cfp_array1d self); + void (*flush_cache)(const cfp_array1d self); + size_t (*size_bytes)(const cfp_array1d self, uint mask); + size_t (*compressed_size)(const cfp_array1d self); + void* (*compressed_data)(const cfp_array1d self); + size_t (*size)(const cfp_array1d self); + void (*resize)(cfp_array1d self, size_t n, zfp_bool clear); + + void (*get_array)(const cfp_array1d self, double* p); + void (*set_array)(cfp_array1d self, const double* p); + double (*get_flat)(const cfp_array1d self, size_t i); + void (*set_flat)(cfp_array1d self, size_t i, double val); + double (*get)(const cfp_array1d self, size_t i); + void (*set)(cfp_array1d self, size_t i, double val); + + cfp_ref1d (*ref)(cfp_array1d self, size_t i); + cfp_ref1d (*ref_flat)(cfp_array1d self, size_t i); + + cfp_ptr1d (*ptr)(cfp_array1d self, size_t i); + cfp_ptr1d (*ptr_flat)(cfp_array1d self, size_t i); + + cfp_iter1d (*begin)(cfp_array1d self); + cfp_iter1d (*end)(cfp_array1d self); + + cfp_ref1d_api reference; + cfp_ptr1d_api pointer; + cfp_iter1d_api iterator; + cfp_header1d_api header; +} cfp_array1d_api; + +#endif diff --git a/include/zfp/internal/cfp/array1f.h b/include/zfp/internal/cfp/array1f.h new file mode 100644 index 00000000..3f4d21e4 --- /dev/null +++ b/include/zfp/internal/cfp/array1f.h @@ -0,0 +1,141 @@ +#ifndef CFP_ARRAY_1F_H +#define CFP_ARRAY_1F_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array1f; + +typedef struct { + cfp_array1f array; + size_t x; +} cfp_ref1f; + +typedef struct { + cfp_ref1f reference; +} cfp_ptr1f; + +typedef struct { + cfp_array1f array; + size_t x; +} cfp_iter1f; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ref1f self); + void (*set)(cfp_ref1f self, float val); + cfp_ptr1f (*ptr)(cfp_ref1f self); + void (*copy)(cfp_ref1f self, const cfp_ref1f src); +} cfp_ref1f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ptr1f self); + float (*get_at)(const cfp_ptr1f self, ptrdiff_t d); + void (*set)(cfp_ptr1f self, float val); + void (*set_at)(cfp_ptr1f self, ptrdiff_t d, float val); + cfp_ref1f (*ref)(cfp_ptr1f self); + cfp_ref1f (*ref_at)(cfp_ptr1f self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + zfp_bool (*gt)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + zfp_bool (*leq)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + zfp_bool (*geq)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + zfp_bool (*eq)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + zfp_bool (*neq)(const cfp_ptr1f lhs, const cfp_ptr1f rhs); + ptrdiff_t (*distance)(const cfp_ptr1f first, const cfp_ptr1f last); + cfp_ptr1f (*next)(const cfp_ptr1f p, ptrdiff_t 
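/* --- Illustrative usage sketch (not part of this patch) ---
   The cfp_array1d_api struct above is a table of function pointers; client code
   reaches it through the global "cfp" dispatch object exported by the cfp
   library. The member signatures below match those declared in array1d.h; the
   name of the top-level cfp header is an assumption. */
#include <stddef.h>
#include "cfp.h" /* assumed main cfp header for this release */

static double sum_of_squares(size_t n, double rate)
{
  /* uninitialized n-element double array compressed at the given rate */
  cfp_array1d a = cfp.array1d.ctor(n, rate, NULL, 0);
  double s = 0;
  size_t i;
  for (i = 0; i < n; i++)
    cfp.array1d.set(a, i, (double)i);
  for (i = 0; i < n; i++)
    s += cfp.array1d.get(a, i) * cfp.array1d.get(a, i);
  cfp.array1d.dtor(a);
  return s;
}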
d); + cfp_ptr1f (*prev)(const cfp_ptr1f p, ptrdiff_t d); + cfp_ptr1f (*inc)(const cfp_ptr1f p); + cfp_ptr1f (*dec)(const cfp_ptr1f p); +} cfp_ptr1f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_iter1f self); + float (*get_at)(const cfp_iter1f self, ptrdiff_t d); + void (*set)(cfp_iter1f self, float val); + void (*set_at)(cfp_iter1f self, ptrdiff_t d, float val); + cfp_ref1f (*ref)(cfp_iter1f self); + cfp_ref1f (*ref_at)(cfp_iter1f self, ptrdiff_t d); + cfp_ptr1f (*ptr)(cfp_iter1f self); + cfp_ptr1f (*ptr_at)(cfp_iter1f self, ptrdiff_t d); + size_t (*i)(const cfp_iter1f self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter1f lhs, const cfp_iter1f rhs); + zfp_bool (*gt)(const cfp_iter1f lhs, const cfp_iter1f rhs); + zfp_bool (*leq)(const cfp_iter1f lhs, const cfp_iter1f rhs); + zfp_bool (*geq)(const cfp_iter1f lhs, const cfp_iter1f rhs); + zfp_bool (*eq)(const cfp_iter1f lhs, const cfp_iter1f rhs); + zfp_bool (*neq)(const cfp_iter1f lhs, const cfp_iter1f rhs); + ptrdiff_t (*distance)(const cfp_iter1f first, const cfp_iter1f last); + cfp_iter1f (*next)(const cfp_iter1f it, ptrdiff_t d); + cfp_iter1f (*prev)(const cfp_iter1f it, ptrdiff_t d); + cfp_iter1f (*inc)(const cfp_iter1f it); + cfp_iter1f (*dec)(const cfp_iter1f it); +} cfp_iter1f_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array1f a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header1f_api; + +typedef struct { + cfp_array1f (*ctor_default)(); + cfp_array1f (*ctor)(size_t n, double rate, const float* p, size_t cache_size); + cfp_array1f (*ctor_copy)(const cfp_array1f src); + cfp_array1f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array1f self); + + void (*deep_copy)(cfp_array1f self, const cfp_array1f src); + + double (*rate)(const cfp_array1f self); + double (*set_rate)(cfp_array1f self, double rate); + size_t (*cache_size)(const cfp_array1f self); + void (*set_cache_size)(cfp_array1f self, size_t bytes); + void (*clear_cache)(const cfp_array1f self); + void (*flush_cache)(const cfp_array1f self); + size_t (*size_bytes)(const cfp_array1f self, uint mask); + size_t (*compressed_size)(const cfp_array1f self); + void* (*compressed_data)(const cfp_array1f self); + size_t (*size)(const cfp_array1f self); + void (*resize)(cfp_array1f self, size_t n, zfp_bool clear); + + void (*get_array)(const cfp_array1f self, float* p); + void (*set_array)(cfp_array1f self, const float* p); + float (*get_flat)(const cfp_array1f self, size_t i); + void (*set_flat)(cfp_array1f self, size_t i, float val); + float (*get)(const cfp_array1f self, size_t i); + void (*set)(cfp_array1f self, size_t i, float val); + + cfp_ref1f (*ref)(cfp_array1f self, size_t i); + cfp_ref1f (*ref_flat)(cfp_array1f self, size_t i); + + cfp_ptr1f (*ptr)(cfp_array1f self, size_t i); + cfp_ptr1f (*ptr_flat)(cfp_array1f self, size_t i); + + cfp_iter1f (*begin)(cfp_array1f self); + cfp_iter1f 
(*end)(cfp_array1f self); + + cfp_ref1f_api reference; + cfp_ptr1f_api pointer; + cfp_iter1f_api iterator; + cfp_header1f_api header; +} cfp_array1f_api; + +#endif diff --git a/include/zfp/internal/cfp/array2d.h b/include/zfp/internal/cfp/array2d.h new file mode 100644 index 00000000..9b070987 --- /dev/null +++ b/include/zfp/internal/cfp/array2d.h @@ -0,0 +1,144 @@ +#ifndef CFP_ARRAY_2D_H +#define CFP_ARRAY_2D_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array2d; + +typedef struct { + cfp_array2d array; + size_t x, y; +} cfp_ref2d; + +typedef struct { + cfp_ref2d reference; +} cfp_ptr2d; + +typedef struct { + cfp_array2d array; + size_t x, y; +} cfp_iter2d; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ref2d self); + void (*set)(cfp_ref2d self, double val); + cfp_ptr2d (*ptr)(cfp_ref2d self); + void (*copy)(cfp_ref2d self, const cfp_ref2d src); +} cfp_ref2d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ptr2d self); + double (*get_at)(const cfp_ptr2d self, ptrdiff_t d); + void (*set)(cfp_ptr2d self, double val); + void (*set_at)(cfp_ptr2d self, ptrdiff_t d, double val); + cfp_ref2d (*ref)(cfp_ptr2d self); + cfp_ref2d (*ref_at)(cfp_ptr2d self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + zfp_bool (*gt)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + zfp_bool (*leq)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + zfp_bool (*geq)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + zfp_bool (*eq)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + zfp_bool (*neq)(const cfp_ptr2d lhs, const cfp_ptr2d rhs); + ptrdiff_t (*distance)(const cfp_ptr2d first, const cfp_ptr2d last); + cfp_ptr2d (*next)(const cfp_ptr2d p, ptrdiff_t d); + cfp_ptr2d (*prev)(const cfp_ptr2d p, ptrdiff_t d); + cfp_ptr2d (*inc)(const cfp_ptr2d p); + cfp_ptr2d (*dec)(const cfp_ptr2d p); +} cfp_ptr2d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_iter2d self); + double (*get_at)(const cfp_iter2d self, ptrdiff_t d); + void (*set)(cfp_iter2d self, double value); + void (*set_at)(cfp_iter2d self, ptrdiff_t d, double value); + cfp_ref2d (*ref)(cfp_iter2d self); + cfp_ref2d (*ref_at)(cfp_iter2d self, ptrdiff_t d); + cfp_ptr2d (*ptr)(cfp_iter2d self); + cfp_ptr2d (*ptr_at)(cfp_iter2d self, ptrdiff_t d); + size_t (*i)(const cfp_iter2d self); + size_t (*j)(const cfp_iter2d self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter2d lhs, const cfp_iter2d rhs); + zfp_bool (*gt)(const cfp_iter2d lhs, const cfp_iter2d rhs); + zfp_bool (*leq)(const cfp_iter2d lhs, const cfp_iter2d rhs); + zfp_bool (*geq)(const cfp_iter2d lhs, const cfp_iter2d rhs); + zfp_bool (*eq)(const cfp_iter2d lhs, const cfp_iter2d rhs); + zfp_bool (*neq)(const cfp_iter2d lhs, const cfp_iter2d rhs); + ptrdiff_t (*distance)(const cfp_iter2d fist, const cfp_iter2d last); + cfp_iter2d (*next)(const cfp_iter2d it, ptrdiff_t d); + cfp_iter2d (*prev)(const cfp_iter2d it, ptrdiff_t d); + cfp_iter2d (*inc)(const cfp_iter2d it); + cfp_iter2d (*dec)(const cfp_iter2d it); +} cfp_iter2d_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array2d a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header 
self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header2d_api; + +typedef struct { + cfp_array2d (*ctor_default)(); + cfp_array2d (*ctor)(size_t nx, size_t ny, double rate, const double* p, size_t cache_size); + cfp_array2d (*ctor_copy)(const cfp_array2d src); + cfp_array2d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array2d self); + + void (*deep_copy)(cfp_array2d self, const cfp_array2d src); + + double (*rate)(const cfp_array2d self); + double (*set_rate)(cfp_array2d self, double rate); + size_t (*cache_size)(const cfp_array2d self); + void (*set_cache_size)(cfp_array2d self, size_t bytes); + void (*clear_cache)(const cfp_array2d self); + void (*flush_cache)(const cfp_array2d self); + size_t (*size_bytes)(const cfp_array2d self, uint mask); + size_t (*compressed_size)(const cfp_array2d self); + void* (*compressed_data)(const cfp_array2d self); + size_t (*size)(const cfp_array2d self); + size_t (*size_x)(const cfp_array2d self); + size_t (*size_y)(const cfp_array2d self); + void (*resize)(cfp_array2d self, size_t nx, size_t ny, zfp_bool clear); + + void (*get_array)(const cfp_array2d self, double* p); + void (*set_array)(cfp_array2d self, const double* p); + double (*get_flat)(const cfp_array2d self, size_t i); + void (*set_flat)(cfp_array2d self, size_t i, double val); + double (*get)(const cfp_array2d self, size_t i, size_t j); + void (*set)(cfp_array2d self, size_t i, size_t j, double val); + + cfp_ref2d (*ref)(cfp_array2d self, size_t i, size_t j); + cfp_ref2d (*ref_flat)(cfp_array2d self, size_t i); + + cfp_ptr2d (*ptr)(cfp_array2d self, size_t i, size_t j); + cfp_ptr2d (*ptr_flat)(cfp_array2d self, size_t i); + + cfp_iter2d (*begin)(cfp_array2d self); + cfp_iter2d (*end)(cfp_array2d self); + + cfp_ref2d_api reference; + cfp_ptr2d_api pointer; + cfp_iter2d_api iterator; + cfp_header2d_api header; +} cfp_array2d_api; + +#endif diff --git a/include/zfp/internal/cfp/array2f.h b/include/zfp/internal/cfp/array2f.h new file mode 100644 index 00000000..85bf584e --- /dev/null +++ b/include/zfp/internal/cfp/array2f.h @@ -0,0 +1,144 @@ +#ifndef CFP_ARRAY_2F_H +#define CFP_ARRAY_2F_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array2f; + +typedef struct { + cfp_array2f array; + size_t x, y; +} cfp_ref2f; + +typedef struct { + cfp_ref2f reference; +} cfp_ptr2f; + +typedef struct { + cfp_array2f array; + size_t x, y; +} cfp_iter2f; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ref2f self); + void (*set)(cfp_ref2f self, float val); + cfp_ptr2f (*ptr)(cfp_ref2f self); + void (*copy)(cfp_ref2f self, const cfp_ref2f src); +} cfp_ref2f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ptr2f self); + float (*get_at)(const cfp_ptr2f self, ptrdiff_t d); + void (*set)(cfp_ptr2f self, float val); + void (*set_at)(cfp_ptr2f self, ptrdiff_t d, float val); + cfp_ref2f (*ref)(cfp_ptr2f self); + cfp_ref2f (*ref_at)(cfp_ptr2f self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + zfp_bool (*gt)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + zfp_bool (*leq)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + zfp_bool (*geq)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + zfp_bool (*eq)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + 
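The comparison and arithmetic entries in these pointer tables mirror ordinary C pointer semantics over compressed storage. A minimal usage sketch in C, assuming the public cfp header (written here as "cfp.h") and the global cfp dispatch table it declares, neither of which appears in this hunk:

  #include <stdio.h>
  #include "cfp.h"                /* assumed name of the public cfp header */

  int main(void)
  {
    /* 64x64 array of doubles stored at 8 bits/value, no initial data, default cache size */
    cfp_array2d a = cfp.array2d.ctor(64, 64, 8.0, NULL, 0);
    cfp.array2d.set(a, 3, 5, 1.5);                            /* a(3, 5) = 1.5 */

    /* pointer arithmetic via the non-member entries declared above */
    cfp_ptr2d first = cfp.array2d.ptr_flat(a, 0);
    cfp_ptr2d last  = cfp.array2d.ptr_flat(a, cfp.array2d.size(a) - 1);
    printf("distance = %td\n", cfp.array2d.pointer.distance(first, last));
    printf("a(3,5)   = %g\n", cfp.array2d.get(a, 3, 5));

    cfp.array2d.dtor(a);
    return 0;
  }
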
zfp_bool (*neq)(const cfp_ptr2f lhs, const cfp_ptr2f rhs); + ptrdiff_t (*distance)(const cfp_ptr2f first, const cfp_ptr2f last); + cfp_ptr2f (*next)(const cfp_ptr2f p, ptrdiff_t d); + cfp_ptr2f (*prev)(const cfp_ptr2f p, ptrdiff_t d); + cfp_ptr2f (*inc)(const cfp_ptr2f p); + cfp_ptr2f (*dec)(const cfp_ptr2f p); +} cfp_ptr2f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_iter2f self); + float (*get_at)(const cfp_iter2f self, ptrdiff_t d); + void (*set)(cfp_iter2f self, float val); + void (*set_at)(cfp_iter2f self, ptrdiff_t d, float val); + cfp_ref2f (*ref)(cfp_iter2f self); + cfp_ref2f (*ref_at)(cfp_iter2f self, ptrdiff_t d); + cfp_ptr2f (*ptr)(cfp_iter2f self); + cfp_ptr2f (*ptr_at)(cfp_iter2f self, ptrdiff_t d); + size_t (*i)(const cfp_iter2f self); + size_t (*j)(const cfp_iter2f self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter2f lhs, const cfp_iter2f rhs); + zfp_bool (*gt)(const cfp_iter2f lhs, const cfp_iter2f rhs); + zfp_bool (*leq)(const cfp_iter2f lhs, const cfp_iter2f rhs); + zfp_bool (*geq)(const cfp_iter2f lhs, const cfp_iter2f rhs); + zfp_bool (*eq)(const cfp_iter2f lhs, const cfp_iter2f rhs); + zfp_bool (*neq)(const cfp_iter2f lhs, const cfp_iter2f rhs); + ptrdiff_t (*distance)(const cfp_iter2f first, const cfp_iter2f last); + cfp_iter2f (*next)(const cfp_iter2f it, ptrdiff_t d); + cfp_iter2f (*prev)(const cfp_iter2f it, ptrdiff_t d); + cfp_iter2f (*inc)(const cfp_iter2f it); + cfp_iter2f (*dec)(const cfp_iter2f it); +} cfp_iter2f_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array2f a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header2f_api; + +typedef struct { + cfp_array2f (*ctor_default)(); + cfp_array2f (*ctor)(size_t nx, size_t ny, double rate, const float* p, size_t cache_size); + cfp_array2f (*ctor_copy)(const cfp_array2f src); + cfp_array2f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array2f self); + + void (*deep_copy)(cfp_array2f self, const cfp_array2f src); + + double (*rate)(const cfp_array2f self); + double (*set_rate)(cfp_array2f self, double rate); + size_t (*cache_size)(const cfp_array2f self); + void (*set_cache_size)(cfp_array2f self, size_t bytes); + void (*clear_cache)(const cfp_array2f self); + void (*flush_cache)(const cfp_array2f self); + size_t (*size_bytes)(const cfp_array2f self, uint mask); + size_t (*compressed_size)(const cfp_array2f self); + void* (*compressed_data)(const cfp_array2f self); + size_t (*size)(const cfp_array2f self); + size_t (*size_x)(const cfp_array2f self); + size_t (*size_y)(const cfp_array2f self); + void (*resize)(cfp_array2f self, size_t nx, size_t ny, zfp_bool clear); + + void (*get_array)(const cfp_array2f self, float* p); + void (*set_array)(cfp_array2f self, const float* p); + float (*get_flat)(const cfp_array2f self, size_t i); + void (*set_flat)(cfp_array2f self, size_t i, float val); + float (*get)(const cfp_array2f self, size_t i, size_t 
j); + void (*set)(cfp_array2f self, size_t i, size_t j, float val); + + cfp_ref2f (*ref)(cfp_array2f self, size_t i, size_t j); + cfp_ref2f (*ref_flat)(cfp_array2f self, size_t i); + + cfp_ptr2f (*ptr)(cfp_array2f self, size_t i, size_t j); + cfp_ptr2f (*ptr_flat)(cfp_array2f self, size_t i); + + cfp_iter2f (*begin)(cfp_array2f self); + cfp_iter2f (*end)(cfp_array2f self); + + cfp_ref2f_api reference; + cfp_ptr2f_api pointer; + cfp_iter2f_api iterator; + cfp_header2f_api header; +} cfp_array2f_api; + +#endif diff --git a/include/zfp/internal/cfp/array3d.h b/include/zfp/internal/cfp/array3d.h new file mode 100644 index 00000000..c3c337f6 --- /dev/null +++ b/include/zfp/internal/cfp/array3d.h @@ -0,0 +1,146 @@ +#ifndef CFP_ARRAY_3D_H +#define CFP_ARRAY_3D_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array3d; + +typedef struct { + cfp_array3d array; + size_t x, y, z; +} cfp_ref3d; + +typedef struct { + cfp_ref3d reference; +} cfp_ptr3d; + +typedef struct { + cfp_array3d array; + size_t x, y, z; +} cfp_iter3d; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ref3d self); + void (*set)(cfp_ref3d self, double val); + cfp_ptr3d (*ptr)(cfp_ref3d self); + void (*copy)(cfp_ref3d self, const cfp_ref3d src); +} cfp_ref3d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ptr3d self); + double (*get_at)(const cfp_ptr3d self, ptrdiff_t d); + void (*set)(cfp_ptr3d self, double val); + void (*set_at)(cfp_ptr3d self, ptrdiff_t d, double val); + cfp_ref3d (*ref)(cfp_ptr3d self); + cfp_ref3d (*ref_at)(cfp_ptr3d self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + zfp_bool (*gt)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + zfp_bool (*leq)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + zfp_bool (*geq)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + zfp_bool (*eq)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + zfp_bool (*neq)(const cfp_ptr3d lhs, const cfp_ptr3d rhs); + ptrdiff_t (*distance)(const cfp_ptr3d first, const cfp_ptr3d last); + cfp_ptr3d (*next)(const cfp_ptr3d p, ptrdiff_t d); + cfp_ptr3d (*prev)(const cfp_ptr3d p, ptrdiff_t d); + cfp_ptr3d (*inc)(const cfp_ptr3d p); + cfp_ptr3d (*dec)(const cfp_ptr3d p); +} cfp_ptr3d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_iter3d self); + double (*get_at)(const cfp_iter3d self, ptrdiff_t d); + void (*set)(cfp_iter3d self, double val); + void (*set_at)(cfp_iter3d self, ptrdiff_t d, double val); + cfp_ref3d (*ref)(cfp_iter3d self); + cfp_ref3d (*ref_at)(cfp_iter3d self, ptrdiff_t d); + cfp_ptr3d (*ptr)(cfp_iter3d self); + cfp_ptr3d (*ptr_at)(cfp_iter3d self, ptrdiff_t d); + size_t (*i)(const cfp_iter3d self); + size_t (*j)(const cfp_iter3d self); + size_t (*k)(const cfp_iter3d self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter3d lhs, const cfp_iter3d rhs); + zfp_bool (*gt)(const cfp_iter3d lhs, const cfp_iter3d rhs); + zfp_bool (*leq)(const cfp_iter3d lhs, const cfp_iter3d rhs); + zfp_bool (*geq)(const cfp_iter3d lhs, const cfp_iter3d rhs); + zfp_bool (*eq)(const cfp_iter3d lhs, const cfp_iter3d rhs); + zfp_bool (*neq)(const cfp_iter3d lhs, const cfp_iter3d rhs); + ptrdiff_t (*distance)(const cfp_iter3d first, const cfp_iter3d last); + cfp_iter3d (*next)(const cfp_iter3d it, ptrdiff_t d); + cfp_iter3d (*prev)(const cfp_iter3d it, ptrdiff_t d); + cfp_iter3d (*inc)(const cfp_iter3d it); + cfp_iter3d (*dec)(const cfp_iter3d it); +} cfp_iter3d_api; + +typedef struct { + /* 
constructor/destructor */ + cfp_header (*ctor)(const cfp_array3d a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header3d_api; + +typedef struct { + cfp_array3d (*ctor_default)(); + cfp_array3d (*ctor)(size_t nx, size_t ny, size_t nz, double rate, const double* p, size_t cache_size); + cfp_array3d (*ctor_copy)(const cfp_array3d src); + cfp_array3d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array3d self); + + void (*deep_copy)(cfp_array3d self, const cfp_array3d src); + + double (*rate)(const cfp_array3d self); + double (*set_rate)(cfp_array3d self, double rate); + size_t (*cache_size)(const cfp_array3d self); + void (*set_cache_size)(cfp_array3d self, size_t bytes); + void (*clear_cache)(const cfp_array3d self); + void (*flush_cache)(const cfp_array3d self); + size_t (*size_bytes)(const cfp_array3d self, uint mask); + size_t (*compressed_size)(const cfp_array3d self); + void* (*compressed_data)(const cfp_array3d self); + size_t (*size)(const cfp_array3d self); + size_t (*size_x)(const cfp_array3d self); + size_t (*size_y)(const cfp_array3d self); + size_t (*size_z)(const cfp_array3d self); + void (*resize)(cfp_array3d self, size_t nx, size_t ny, size_t nz, zfp_bool clear); + + void (*get_array)(const cfp_array3d self, double* p); + void (*set_array)(cfp_array3d self, const double* p); + double (*get_flat)(const cfp_array3d self, size_t i); + void (*set_flat)(cfp_array3d self, size_t i, double val); + double (*get)(const cfp_array3d self, size_t i, size_t j, size_t k); + void (*set)(cfp_array3d self, size_t i, size_t j, size_t k, double val); + + cfp_ref3d (*ref)(cfp_array3d self, size_t i, size_t j, size_t k); + cfp_ref3d (*ref_flat)(cfp_array3d self, size_t i); + + cfp_ptr3d (*ptr)(cfp_array3d self, size_t i, size_t j, size_t k); + cfp_ptr3d (*ptr_flat)(cfp_array3d self, size_t i); + + cfp_iter3d (*begin)(cfp_array3d self); + cfp_iter3d (*end)(cfp_array3d self); + + cfp_ref3d_api reference; + cfp_ptr3d_api pointer; + cfp_iter3d_api iterator; + cfp_header3d_api header; +} cfp_array3d_api; + +#endif diff --git a/include/zfp/internal/cfp/array3f.h b/include/zfp/internal/cfp/array3f.h new file mode 100644 index 00000000..43fbe722 --- /dev/null +++ b/include/zfp/internal/cfp/array3f.h @@ -0,0 +1,146 @@ +#ifndef CFP_ARRAY_3F_H +#define CFP_ARRAY_3F_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array3f; + +typedef struct { + cfp_array3f array; + size_t x, y, z; +} cfp_ref3f; + +typedef struct { + cfp_ref3f reference; +} cfp_ptr3f; + +typedef struct { + cfp_array3f array; + size_t x, y, z; +} cfp_iter3f; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ref3f self); + void (*set)(cfp_ref3f self, float val); + cfp_ptr3f (*ptr)(cfp_ref3f self); + void (*copy)(cfp_ref3f self, const cfp_ref3f src); +} cfp_ref3f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ptr3f self); + float (*get_at)(const cfp_ptr3f self, ptrdiff_t d); + void 
(*set)(cfp_ptr3f self, float val); + void (*set_at)(cfp_ptr3f self, ptrdiff_t d, float val); + cfp_ref3f (*ref)(cfp_ptr3f self); + cfp_ref3f (*ref_at)(cfp_ptr3f self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + zfp_bool (*gt)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + zfp_bool (*leq)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + zfp_bool (*geq)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + zfp_bool (*eq)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + zfp_bool (*neq)(const cfp_ptr3f lhs, const cfp_ptr3f rhs); + ptrdiff_t (*distance)(const cfp_ptr3f first, const cfp_ptr3f last); + cfp_ptr3f (*next)(const cfp_ptr3f p, ptrdiff_t d); + cfp_ptr3f (*prev)(const cfp_ptr3f p, ptrdiff_t d); + cfp_ptr3f (*inc)(const cfp_ptr3f p); + cfp_ptr3f (*dec)(const cfp_ptr3f p); +} cfp_ptr3f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_iter3f self); + float (*get_at)(const cfp_iter3f self, ptrdiff_t d); + void (*set)(cfp_iter3f self, float val); + void (*set_at)(cfp_iter3f self, ptrdiff_t d, float val); + cfp_ref3f (*ref)(cfp_iter3f self); + cfp_ref3f (*ref_at)(cfp_iter3f self, ptrdiff_t d); + cfp_ptr3f (*ptr)(cfp_iter3f self); + cfp_ptr3f (*ptr_at)(cfp_iter3f self, ptrdiff_t d); + size_t (*i)(const cfp_iter3f self); + size_t (*j)(const cfp_iter3f self); + size_t (*k)(const cfp_iter3f self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter3f lhs, const cfp_iter3f rhs); + zfp_bool (*gt)(const cfp_iter3f lhs, const cfp_iter3f rhs); + zfp_bool (*leq)(const cfp_iter3f lhs, const cfp_iter3f rhs); + zfp_bool (*geq)(const cfp_iter3f lhs, const cfp_iter3f rhs); + zfp_bool (*eq)(const cfp_iter3f lhs, const cfp_iter3f rhs); + zfp_bool (*neq)(const cfp_iter3f lhs, const cfp_iter3f rhs); + ptrdiff_t (*distance)(const cfp_iter3f first, const cfp_iter3f last); + cfp_iter3f (*next)(const cfp_iter3f it, ptrdiff_t d); + cfp_iter3f (*prev)(const cfp_iter3f it, ptrdiff_t d); + cfp_iter3f (*inc)(const cfp_iter3f it); + cfp_iter3f (*dec)(const cfp_iter3f it); +} cfp_iter3f_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array3f a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header3f_api; + +typedef struct { + cfp_array3f (*ctor_default)(); + cfp_array3f (*ctor)(size_t nx, size_t ny, size_t nz, double rate, const float* p, size_t cache_size); + cfp_array3f (*ctor_copy)(const cfp_array3f src); + cfp_array3f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array3f self); + + void (*deep_copy)(cfp_array3f self, const cfp_array3f src); + + double (*rate)(const cfp_array3f self); + double (*set_rate)(cfp_array3f self, double rate); + size_t (*cache_size)(const cfp_array3f self); + void (*set_cache_size)(cfp_array3f self, size_t bytes); + void (*clear_cache)(const cfp_array3f self); + void (*flush_cache)(const cfp_array3f self); + size_t (*size_bytes)(const cfp_array3f self, uint mask); + size_t 
(*compressed_size)(const cfp_array3f self); + void* (*compressed_data)(const cfp_array3f self); + size_t (*size)(const cfp_array3f self); + size_t (*size_x)(const cfp_array3f self); + size_t (*size_y)(const cfp_array3f self); + size_t (*size_z)(const cfp_array3f self); + void (*resize)(cfp_array3f self, size_t nx, size_t ny, size_t nz, zfp_bool clear); + + void (*get_array)(const cfp_array3f self, float* p); + void (*set_array)(cfp_array3f self, const float* p); + float (*get_flat)(const cfp_array3f self, size_t i); + void (*set_flat)(cfp_array3f self, size_t i, float val); + float (*get)(const cfp_array3f self, size_t i, size_t j, size_t k); + void (*set)(cfp_array3f self, size_t i, size_t j, size_t k, float val); + + cfp_ref3f (*ref)(cfp_array3f self, size_t i, size_t j, size_t k); + cfp_ref3f (*ref_flat)(cfp_array3f self, size_t i); + + cfp_ptr3f (*ptr)(cfp_array3f self, size_t i, size_t j, size_t k); + cfp_ptr3f (*ptr_flat)(cfp_array3f self, size_t i); + + cfp_iter3f (*begin)(cfp_array3f self); + cfp_iter3f (*end)(cfp_array3f self); + + cfp_ref3f_api reference; + cfp_ptr3f_api pointer; + cfp_iter3f_api iterator; + cfp_header3f_api header; +} cfp_array3f_api; + +#endif diff --git a/include/zfp/internal/cfp/array4d.h b/include/zfp/internal/cfp/array4d.h new file mode 100644 index 00000000..22e6a88f --- /dev/null +++ b/include/zfp/internal/cfp/array4d.h @@ -0,0 +1,148 @@ +#ifndef CFP_ARRAY_4D_H +#define CFP_ARRAY_4D_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array4d; + +typedef struct { + cfp_array4d array; + size_t x, y, z, w; +} cfp_ref4d; + +typedef struct { + cfp_ref4d reference; +} cfp_ptr4d; + +typedef struct { + cfp_array4d array; + size_t x, y, z, w; +} cfp_iter4d; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ref4d self); + void (*set)(cfp_ref4d self, double val); + cfp_ptr4d (*ptr)(cfp_ref4d self); + void (*copy)(cfp_ref4d self, const cfp_ref4d src); +} cfp_ref4d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_ptr4d self); + double (*get_at)(const cfp_ptr4d self, ptrdiff_t d); + void (*set)(cfp_ptr4d self, double val); + void (*set_at)(cfp_ptr4d self, ptrdiff_t d, double val); + cfp_ref4d (*ref)(cfp_ptr4d self); + cfp_ref4d (*ref_at)(cfp_ptr4d self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + zfp_bool (*gt)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + zfp_bool (*leq)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + zfp_bool (*geq)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + zfp_bool (*eq)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + zfp_bool (*neq)(const cfp_ptr4d lhs, const cfp_ptr4d rhs); + ptrdiff_t (*distance)(const cfp_ptr4d first, const cfp_ptr4d last); + cfp_ptr4d (*next)(const cfp_ptr4d p, ptrdiff_t d); + cfp_ptr4d (*prev)(const cfp_ptr4d p, ptrdiff_t d); + cfp_ptr4d (*inc)(const cfp_ptr4d p); + cfp_ptr4d (*dec)(const cfp_ptr4d p); +} cfp_ptr4d_api; + +typedef struct { + /* member functions */ + double (*get)(const cfp_iter4d self); + double (*get_at)(const cfp_iter4d self, ptrdiff_t d); + void (*set)(cfp_iter4d self, double val); + void (*set_at)(cfp_iter4d self, ptrdiff_t d, double val); + cfp_ref4d (*ref)(cfp_iter4d self); + cfp_ref4d (*ref_at)(cfp_iter4d self, ptrdiff_t d); + cfp_ptr4d (*ptr)(cfp_iter4d self); + cfp_ptr4d (*ptr_at)(cfp_iter4d self, ptrdiff_t d); + size_t (*i)(const cfp_iter4d self); + size_t (*j)(const cfp_iter4d self); + size_t (*k)(const cfp_iter4d self); + size_t (*l)(const cfp_iter4d self); + /* non-member 
functions */ + zfp_bool (*lt)(const cfp_iter4d lhs, const cfp_iter4d rhs); + zfp_bool (*gt)(const cfp_iter4d lhs, const cfp_iter4d rhs); + zfp_bool (*leq)(const cfp_iter4d lhs, const cfp_iter4d rhs); + zfp_bool (*geq)(const cfp_iter4d lhs, const cfp_iter4d rhs); + zfp_bool (*eq)(const cfp_iter4d lhs, const cfp_iter4d rhs); + zfp_bool (*neq)(const cfp_iter4d lhs, const cfp_iter4d rhs); + ptrdiff_t (*distance)(const cfp_iter4d first, const cfp_iter4d last); + cfp_iter4d (*next)(const cfp_iter4d it, ptrdiff_t d); + cfp_iter4d (*prev)(const cfp_iter4d it, ptrdiff_t d); + cfp_iter4d (*inc)(const cfp_iter4d it); + cfp_iter4d (*dec)(const cfp_iter4d it); +} cfp_iter4d_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array4d a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t (*size_bytes)(const cfp_header self, uint mask); +} cfp_header4d_api; + +typedef struct { + cfp_array4d (*ctor_default)(); + cfp_array4d (*ctor)(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const double* p, size_t cache_size); + cfp_array4d (*ctor_copy)(const cfp_array4d src); + cfp_array4d (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array4d self); + + void (*deep_copy)(cfp_array4d self, const cfp_array4d src); + + double (*rate)(const cfp_array4d self); + double (*set_rate)(cfp_array4d self, double rate); + size_t (*cache_size)(const cfp_array4d self); + void (*set_cache_size)(cfp_array4d self, size_t bytes); + void (*clear_cache)(const cfp_array4d self); + void (*flush_cache)(const cfp_array4d self); + size_t (*size_bytes)(const cfp_array4d self, uint mask); + size_t (*compressed_size)(const cfp_array4d self); + void* (*compressed_data)(const cfp_array4d self); + size_t (*size)(const cfp_array4d self); + size_t (*size_x)(const cfp_array4d self); + size_t (*size_y)(const cfp_array4d self); + size_t (*size_z)(const cfp_array4d self); + size_t (*size_w)(const cfp_array4d self); + void (*resize)(cfp_array4d self, size_t nx, size_t ny, size_t nz, size_t nw, zfp_bool clear); + + void (*get_array)(const cfp_array4d self, double* p); + void (*set_array)(cfp_array4d self, const double* p); + double (*get_flat)(const cfp_array4d self, size_t i); + void (*set_flat)(cfp_array4d self, size_t i, double val); + double (*get)(const cfp_array4d self, size_t i, size_t j, size_t k, size_t l); + void (*set)(cfp_array4d self, size_t i, size_t j, size_t k, size_t l, double val); + + cfp_ref4d (*ref)(cfp_array4d self, size_t i, size_t j, size_t k, size_t l); + cfp_ref4d (*ref_flat)(cfp_array4d self, size_t i); + + cfp_ptr4d (*ptr)(cfp_array4d self, size_t i, size_t j, size_t k, size_t l); + cfp_ptr4d (*ptr_flat)(cfp_array4d self, size_t i); + + cfp_iter4d (*begin)(cfp_array4d self); + cfp_iter4d (*end)(cfp_array4d self); + + cfp_ref4d_api reference; + cfp_ptr4d_api pointer; + cfp_iter4d_api iterator; + cfp_header4d_api header; +} cfp_array4d_api; + +#endif diff --git a/include/zfp/internal/cfp/array4f.h b/include/zfp/internal/cfp/array4f.h new file mode 100644 
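The per-dimension header table and the ctor_header entry shown above are what a C caller uses to serialize and later rebuild a compressed array. A rough round-trip sketch, assuming an existing cfp_array4d a and the same global cfp dispatch table as in the earlier sketch:

  /* serialize: capture the header and the compressed payload */
  cfp_header h        = cfp.array4d.header.ctor(a);
  const void* hdr     = cfp.array4d.header.data(h);
  size_t hdr_bytes    = cfp.array4d.header.size_bytes(h, ZFP_DATA_HEADER);
  void* payload       = cfp.array4d.compressed_data(a);
  size_t payload_size = cfp.array4d.compressed_size(a);
  /* ... write hdr (hdr_bytes) and payload (payload_size) to storage ... */

  /* deserialize: reconstruct an equivalent array from the two buffers */
  cfp_array4d b = cfp.array4d.ctor_header(h, payload, payload_size);

  cfp.array4d.header.dtor(h);
  cfp.array4d.dtor(b);
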
index 00000000..b5e07674 --- /dev/null +++ b/include/zfp/internal/cfp/array4f.h @@ -0,0 +1,148 @@ +#ifndef CFP_ARRAY_4F_H +#define CFP_ARRAY_4F_H + +#include +#include "zfp.h" + +typedef struct { + void* object; +} cfp_array4f; + +typedef struct { + cfp_array4f array; + size_t x, y, z, w; +} cfp_ref4f; + +typedef struct { + cfp_ref4f reference; +} cfp_ptr4f; + +typedef struct { + cfp_array4f array; + size_t x, y, z, w; +} cfp_iter4f; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ref4f self); + void (*set)(cfp_ref4f self, float val); + cfp_ptr4f (*ptr)(cfp_ref4f self); + void (*copy)(cfp_ref4f self, const cfp_ref4f src); +} cfp_ref4f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_ptr4f self); + float (*get_at)(const cfp_ptr4f self, ptrdiff_t d); + void (*set)(cfp_ptr4f self, float val); + void (*set_at)(cfp_ptr4f self, ptrdiff_t d, float val); + cfp_ref4f (*ref)(cfp_ptr4f self); + cfp_ref4f (*ref_at)(cfp_ptr4f self, ptrdiff_t d); + /* non-member functions */ + zfp_bool (*lt)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + zfp_bool (*gt)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + zfp_bool (*leq)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + zfp_bool (*geq)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + zfp_bool (*eq)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + zfp_bool (*neq)(const cfp_ptr4f lhs, const cfp_ptr4f rhs); + ptrdiff_t (*distance)(const cfp_ptr4f first, const cfp_ptr4f last); + cfp_ptr4f (*next)(const cfp_ptr4f p, ptrdiff_t d); + cfp_ptr4f (*prev)(const cfp_ptr4f p, ptrdiff_t d); + cfp_ptr4f (*inc)(const cfp_ptr4f p); + cfp_ptr4f (*dec)(const cfp_ptr4f p); +} cfp_ptr4f_api; + +typedef struct { + /* member functions */ + float (*get)(const cfp_iter4f self); + float (*get_at)(const cfp_iter4f self, ptrdiff_t d); + void (*set)(cfp_iter4f self, float val); + void (*set_at)(cfp_iter4f self, ptrdiff_t d, float val); + cfp_ref4f (*ref)(cfp_iter4f self); + cfp_ref4f (*ref_at)(cfp_iter4f self, ptrdiff_t d); + cfp_ptr4f (*ptr)(cfp_iter4f self); + cfp_ptr4f (*ptr_at)(cfp_iter4f self, ptrdiff_t d); + size_t (*i)(const cfp_iter4f self); + size_t (*j)(const cfp_iter4f self); + size_t (*k)(const cfp_iter4f self); + size_t (*l)(const cfp_iter4f self); + /* non-member functions */ + zfp_bool (*lt)(const cfp_iter4f lhs, const cfp_iter4f rhs); + zfp_bool (*gt)(const cfp_iter4f lhs, const cfp_iter4f rhs); + zfp_bool (*leq)(const cfp_iter4f lhs, const cfp_iter4f rhs); + zfp_bool (*geq)(const cfp_iter4f lhs, const cfp_iter4f rhs); + zfp_bool (*eq)(const cfp_iter4f lhs, const cfp_iter4f rhs); + zfp_bool (*neq)(const cfp_iter4f lhs, const cfp_iter4f rhs); + ptrdiff_t (*distance)(const cfp_iter4f first, const cfp_iter4f last); + cfp_iter4f (*next)(const cfp_iter4f it, ptrdiff_t d); + cfp_iter4f (*prev)(const cfp_iter4f it, ptrdiff_t d); + cfp_iter4f (*inc)(const cfp_iter4f it); + cfp_iter4f (*dec)(const cfp_iter4f it); +} cfp_iter4f_api; + +typedef struct { + /* constructor/destructor */ + cfp_header (*ctor)(const cfp_array4f a); + cfp_header (*ctor_buffer)(const void* data, size_t size); + void (*dtor)(cfp_header self); + /* array metadata */ + zfp_type (*scalar_type)(const cfp_header self); + uint (*dimensionality)(const cfp_header self); + size_t (*size_x)(const cfp_header self); + size_t (*size_y)(const cfp_header self); + size_t (*size_z)(const cfp_header self); + size_t (*size_w)(const cfp_header self); + double (*rate)(const cfp_header self); + /* header payload: data pointer and byte size */ + const void* (*data)(const cfp_header self); + size_t 
(*size_bytes)(const cfp_header self, uint mask); +} cfp_header4f_api; + +typedef struct { + cfp_array4f (*ctor_default)(); + cfp_array4f (*ctor)(size_t nx, size_t ny, size_t nz, size_t nw, double rate, const float* p, size_t cache_size); + cfp_array4f (*ctor_copy)(const cfp_array4f src); + cfp_array4f (*ctor_header)(const cfp_header h, const void* buffer, size_t buffer_size_bytes); + void (*dtor)(cfp_array4f self); + + void (*deep_copy)(cfp_array4f self, const cfp_array4f src); + + double (*rate)(const cfp_array4f self); + double (*set_rate)(cfp_array4f self, double rate); + size_t (*cache_size)(const cfp_array4f self); + void (*set_cache_size)(cfp_array4f self, size_t bytes); + void (*clear_cache)(const cfp_array4f self); + void (*flush_cache)(const cfp_array4f self); + size_t (*size_bytes)(const cfp_array4f self, uint mask); + size_t (*compressed_size)(const cfp_array4f self); + void* (*compressed_data)(const cfp_array4f self); + size_t (*size)(const cfp_array4f self); + size_t (*size_x)(const cfp_array4f self); + size_t (*size_y)(const cfp_array4f self); + size_t (*size_z)(const cfp_array4f self); + size_t (*size_w)(const cfp_array4f self); + void (*resize)(cfp_array4f self, size_t nx, size_t ny, size_t nz, size_t nw, zfp_bool clear); + + void (*get_array)(const cfp_array4f self, float* p); + void (*set_array)(cfp_array4f self, const float* p); + float (*get_flat)(const cfp_array4f self, size_t i); + void (*set_flat)(cfp_array4f self, size_t i, float val); + float (*get)(const cfp_array4f self, size_t i, size_t j, size_t k, size_t l); + void (*set)(cfp_array4f self, size_t i, size_t j, size_t k, size_t l, float val); + + cfp_ref4f (*ref)(cfp_array4f self, size_t i, size_t j, size_t k, size_t l); + cfp_ref4f (*ref_flat)(cfp_array4f self, size_t i); + + cfp_ptr4f (*ptr)(cfp_array4f self, size_t i, size_t j, size_t k, size_t l); + cfp_ptr4f (*ptr_flat)(cfp_array4f self, size_t i); + + cfp_iter4f (*begin)(cfp_array4f self); + cfp_iter4f (*end)(cfp_array4f self); + + cfp_ref4f_api reference; + cfp_ptr4f_api pointer; + cfp_iter4f_api iterator; + cfp_header4f_api header; +} cfp_array4f_api; + +#endif diff --git a/include/zfp/internal/cfp/header.h b/include/zfp/internal/cfp/header.h new file mode 100644 index 00000000..01d78ba5 --- /dev/null +++ b/include/zfp/internal/cfp/header.h @@ -0,0 +1,8 @@ +#ifndef CFP_HEADER_H +#define CFP_HEADER_H + +typedef struct { + void* object; +} cfp_header; + +#endif diff --git a/include/zfp/internal/codec/genheader.hpp b/include/zfp/internal/codec/genheader.hpp new file mode 100644 index 00000000..8beec088 --- /dev/null +++ b/include/zfp/internal/codec/genheader.hpp @@ -0,0 +1,76 @@ +// zfp::codec::generic_base::header +class header : public zfp::array::header { +public: + // serialization: construct header from array + header(const zfp::array& a) : + zfp::array::header(a), + bit_rate(static_cast(a.rate())) + { + buffer[0] = magic; + buffer[1] = 0; // TODO: codec identifier (dimensionality, internal type) + buffer[2] = static_cast(bit_rate); + buffer[3] = static_cast(type); + buffer[4] = static_cast(nx); + buffer[5] = static_cast(ny); + buffer[6] = static_cast(nz); + buffer[7] = static_cast(nw); + } + + // deserialization: construct header from memory buffer of optional size + header(const void* data, size_t bytes = 0) : + bit_rate(0) + { + // ensure byte size matches + if (bytes && bytes != byte_size) + throw zfp::exception("zfp generic header length does not match expectations"); + else { + // copy and parse header + std::memcpy(buffer, data, byte_size); + if 
(buffer[0] != magic) + throw zfp::exception("zfp generic header is corrupt"); + bit_rate = static_cast(buffer[2]); + type = static_cast(buffer[3]); + nx = static_cast(buffer[4]); + ny = static_cast(buffer[5]); + nz = static_cast(buffer[6]); + nw = static_cast(buffer[7]); + } + } + + virtual ~header() {} + + // rate in bits per value + double rate() const { return static_cast(bit_rate); } + + // header data + const void* data() const { return buffer; } + + // header byte size + size_t size_bytes(uint mask = ZFP_DATA_HEADER) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) + size += sizeof(*this) - byte_size; + if (mask & ZFP_DATA_HEADER) + size += byte_size; + return size; + } + +protected: + // magic word + static const uint64 magic = UINT64C(0x000000008570667a); + + // header size measured in bits, bytes, and 64-bit words + static const size_t word_size = 8; + static const size_t byte_size = word_size * sizeof(uint64); + static const size_t bit_size = byte_size * CHAR_BIT; + + using zfp::array::header::type; + using zfp::array::header::nx; + using zfp::array::header::ny; + using zfp::array::header::nz; + using zfp::array::header::nw; + + size_t bit_rate; // array rate in bits per value + uint64 buffer[word_size]; // header data +}; diff --git a/include/zfp/internal/codec/zfpheader.hpp b/include/zfp/internal/codec/zfpheader.hpp new file mode 100644 index 00000000..6823e049 --- /dev/null +++ b/include/zfp/internal/codec/zfpheader.hpp @@ -0,0 +1,129 @@ +// zfp::codec::zfp_base::header +class header : public zfp::array::header { +public: + // serialization: construct header from array + header(const zfp::array& a) : + zfp::array::header(a), + bit_rate(a.rate()) + { + std::string error; + + // set up zfp stream and field for generating header + bitstream* stream = stream_open(buffer, sizeof(buffer)); + zfp_stream* zfp = zfp_stream_open(stream); + bit_rate = zfp_stream_set_rate(zfp, bit_rate, type, dimensionality(), zfp_true); + if (zfp_stream_mode(zfp) > ZFP_MODE_SHORT_MAX) + error = "zfp serialization supports only short headers"; + else { + // set up field + zfp_field* field = 0; + switch (dimensionality()) { + case 1: + field = zfp_field_1d(0, type, nx); + break; + case 2: + field = zfp_field_2d(0, type, nx, ny); + break; + case 3: + field = zfp_field_3d(0, type, nx, ny, nz); + break; + case 4: + field = zfp_field_4d(0, type, nx, ny, nz, nw); + break; + default: + error = "zfp serialization supports only 1D, 2D, 3D, and 4D arrays"; + break; + } + + if (field) { + // write header to buffer + size_t bits = zfp_write_header(zfp, field, ZFP_HEADER_FULL); + if (bits != bit_size) + error = "zfp header length does not match expected length"; + zfp_stream_flush(zfp); + zfp_field_free(field); + } + } + + zfp_stream_close(zfp); + stream_close(stream); + + if (!error.empty()) + throw zfp::exception(error); + } + + // deserialization: construct header from memory buffer of optional size + header(const void* data, size_t bytes = 0) : + bit_rate(0) + { + std::string error; + + // ensure byte size matches + if (bytes && bytes != byte_size) + error = "zfp header length does not match expectations"; + else { + // copy and parse header + std::fill(buffer, buffer + word_size, 0); + std::memcpy(buffer, data, byte_size); + bitstream* stream = stream_open(buffer, sizeof(buffer)); + zfp_stream* zfp = zfp_stream_open(stream); + zfp_field field; + size_t bits = zfp_read_header(zfp, &field, ZFP_HEADER_FULL); + if (!bits) + error = "zfp header is corrupt"; + else if (bits != bit_size) + error = "zfp 
deserialization supports only short headers"; + else if (zfp_stream_compression_mode(zfp) != zfp_mode_fixed_rate) + error = "zfp deserialization supports only fixed-rate mode"; + else { + // success; initialize fields + type = field.type; + nx = field.nx; + ny = field.ny; + nz = field.nz; + nw = field.nw; + bit_rate = double(zfp->maxbits) / (1u << (2 * dimensionality())); + } + zfp_stream_close(zfp); + stream_close(stream); + } + + // throw exception upon error + if (!error.empty()) + throw zfp::exception(error); + } + + virtual ~header() {} + + // rate in bits per value + double rate() const { return bit_rate; } + + // header data + const void* data() const { return buffer; } + + // header byte size + size_t size_bytes(uint mask = ZFP_DATA_HEADER) const + { + size_t size = 0; + if (mask & ZFP_DATA_META) + size += sizeof(*this) - byte_size; + if (mask & ZFP_DATA_HEADER) + size += byte_size; + return size; + } + +protected: + // header size measured in bits, bytes, and 64-bit words + static const size_t bit_size = ZFP_MAGIC_BITS + ZFP_META_BITS + ZFP_MODE_SHORT_BITS; + static const size_t byte_size = (bit_size + CHAR_BIT - 1) / CHAR_BIT; + static const size_t word_size = (byte_size + sizeof(uint64) - 1) / sizeof(uint64); + + using zfp::array::header::type; + using zfp::array::header::nx; + using zfp::array::header::ny; + using zfp::array::header::nz; + using zfp::array::header::nw; + + double bit_rate; // array rate in bits per value + uint64 buffer[word_size]; // header data +}; diff --git a/src/inline/inline.h b/include/zfp/internal/zfp/inline.h similarity index 77% rename from src/inline/inline.h rename to include/zfp/internal/zfp/inline.h index e9ade3f1..bb10673b 100644 --- a/src/inline/inline.h +++ b/include/zfp/internal/zfp/inline.h @@ -1,5 +1,5 @@ -#ifndef INLINE_H -#define INLINE_H +#ifndef ZFP_INLINE_H +#define ZFP_INLINE_H #ifndef inline_ #if __STDC_VERSION__ >= 199901L diff --git a/include/zfp/macros.h b/include/zfp/internal/zfp/macros.h similarity index 100% rename from include/zfp/macros.h rename to include/zfp/internal/zfp/macros.h diff --git a/include/zfp/system.h b/include/zfp/internal/zfp/system.h similarity index 59% rename from include/zfp/system.h rename to include/zfp/internal/zfp/system.h index 53941964..23c49360 100644 --- a/include/zfp/system.h +++ b/include/zfp/internal/zfp/system.h @@ -1,15 +1,16 @@ #ifndef ZFP_SYSTEM_H #define ZFP_SYSTEM_H -#if __STDC_VERSION__ >= 199901L +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + /* C99: use restrict */ #define restrict_ restrict #else + /* C89: no restrict keyword */ #define restrict_ #endif /* macros for exporting and importing symbols */ -#ifdef _MSC_VER - #define export_ __declspec(dllexport) +#if defined(_MSC_VER) && defined(ZFP_SHARED_LIBS) /* export (import) symbols when ZFP_SOURCE is (is not) defined */ #ifdef ZFP_SOURCE #ifdef __cplusplus @@ -24,8 +25,7 @@ #define extern_ extern __declspec(dllimport) #endif #endif -#else /* !_MSC_VER */ - #define export_ +#else /* !(_MSC_VER && ZFP_SHARED_LIBS) */ #ifdef __cplusplus #define extern_ extern "C" #else @@ -33,13 +33,13 @@ #endif #endif -#ifdef __GNUC__ - /* L1 cache line size for alignment purposes */ - #ifndef ZFP_CACHE_LINE_SIZE - #define ZFP_CACHE_LINE_SIZE 0x100 - #endif - #define align_(n) __attribute__((aligned(n))) - #define cache_align_(x) x align_(ZFP_CACHE_LINE_SIZE) +/* L1 cache line size for alignment purposes */ +#ifndef ZFP_CACHE_LINE_SIZE + #define ZFP_CACHE_LINE_SIZE 0x100 +#endif +/* ZFP_CACHE_LINE_SIZE=0 disables alignment */ +#if 
defined(__GNUC__) && ZFP_CACHE_LINE_SIZE + #define cache_align_(x) x __attribute__((aligned(ZFP_CACHE_LINE_SIZE))) #else #define cache_align_(x) x #endif diff --git a/include/zfp/types.h b/include/zfp/internal/zfp/types.h similarity index 78% rename from include/zfp/types.h rename to include/zfp/internal/zfp/types.h index f57e1f89..b209f378 100644 --- a/include/zfp/types.h +++ b/include/zfp/internal/zfp/types.h @@ -4,8 +4,33 @@ typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned int uint; +typedef unsigned long ulong; -#if __STDC_VERSION__ >= 199901L +#if defined(__cplusplus) && __cplusplus >= 201103L + /* C++11: use standard integer types */ + #include + #include + #define INT64C(x) INT64_C(x) + #define UINT64C(x) UINT64_C(x) + #define INT64PRId PRId64 + #define INT64PRIi PRIi64 + #define UINT64PRIo PRIo64 + #define UINT64PRIu PRIu64 + #define UINT64PRIx PRIx64 + #define INT64SCNd SCNd64 + #define INT64SCNi SCNi64 + #define UINT64SCNo SCNo64 + #define UINT64SCNu SCNu64 + #define UINT64SCNx SCNx64 + typedef std::int8_t int8; + typedef std::uint8_t uint8; + typedef std::int16_t int16; + typedef std::uint16_t uint16; + typedef std::int32_t int32; + typedef std::uint32_t uint32; + typedef std::int64_t int64; + typedef std::uint64_t uint64; +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99: use standard integer types */ #include #include diff --git a/include/zfp/version.h b/include/zfp/version.h new file mode 100644 index 00000000..790927f3 --- /dev/null +++ b/include/zfp/version.h @@ -0,0 +1,49 @@ +#ifndef ZFP_VERSION_H +#define ZFP_VERSION_H + +/* library version information */ +#define ZFP_VERSION_MAJOR 1 /* library major version number */ +#define ZFP_VERSION_MINOR 0 /* library minor version number */ +#define ZFP_VERSION_PATCH 0 /* library patch version number */ +#define ZFP_VERSION_TWEAK 0 /* library tweak version number */ + +/* codec version number (see also zfp_codec_version) */ +#define ZFP_CODEC 5 + +/* stringification */ +#define _zfp_str_(x) # x +#define _zfp_str(x) _zfp_str_(x) + +/* macro for generating an integer version identifier */ +#define ZFP_MAKE_VERSION(major, minor, patch, tweak) \ + (((major) << 12) + \ + ((minor) << 8) + \ + ((patch) << 4) + \ + ((tweak) << 0)) + +/* macros for generating a version string */ +#define ZFP_MAKE_VERSION_STRING(major, minor, patch) \ + _zfp_str(major) "." \ + _zfp_str(minor) "." \ + _zfp_str(patch) + +#define ZFP_MAKE_FULLVERSION_STRING(major, minor, patch, tweak) \ + _zfp_str(major) "." \ + _zfp_str(minor) "." \ + _zfp_str(patch) "." 
\ + _zfp_str(tweak) + +/* library version number (see also zfp_library_version) */ +#define ZFP_VERSION \ + ZFP_MAKE_VERSION(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH, ZFP_VERSION_TWEAK) + +/* library version string (see also zfp_version_string) */ +#if ZFP_VERSION_TWEAK == 0 + #define ZFP_VERSION_STRING \ + ZFP_MAKE_VERSION_STRING(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH) +#else + #define ZFP_VERSION_STRING \ + ZFP_MAKE_FULLVERSION_STRING(ZFP_VERSION_MAJOR, ZFP_VERSION_MINOR, ZFP_VERSION_PATCH, ZFP_VERSION_TWEAK) +#endif + +#endif diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 905dc97e..9c06d453 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,4 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/scikit-build-cmake) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/eyescale-cmake) include(UseCython) include(FindPythonExtensions) include(FindNumPy) @@ -11,16 +10,17 @@ find_package(Cython REQUIRED) find_package(NumPy REQUIRED) include_directories(${ZFP_SOURCE_DIR}/include) -include_directories(${PYTHON_NUMPY_INCLUDE_DIR}) +include_directories(${NumPy_INCLUDE_DIR}) add_cython_target(zfpy zfpy.pyx C) add_library(zfpy MODULE ${zfpy}) target_link_libraries(zfpy zfp) python_extension_module(zfpy) -# Build to the currrent binary dir to avoid conflicts with other libraries named zfp -set(PYLIB_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/lib" CACHE PATH "Directory where zfp python library will be built") +# Build to the current binary dir to avoid conflicts with other libraries named zfp +set(PYLIB_BUILD_DIR "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Directory where zfp python library will be built") set_target_properties(zfpy PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PYLIB_BUILD_DIR}) + # Install to the typical python module directory set(python_install_lib_dir "lib/python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}/site-packages/") install(TARGETS zfpy LIBRARY DESTINATION ${python_install_lib_dir}) diff --git a/python/eyescale-cmake/FindNumPy.cmake b/python/eyescale-cmake/FindNumPy.cmake deleted file mode 100644 index 8aba4e69..00000000 --- a/python/eyescale-cmake/FindNumPy.cmake +++ /dev/null @@ -1,41 +0,0 @@ -# Find the Python NumPy package -# PYTHON_NUMPY_INCLUDE_DIR -# PYTHON_NUMPY_FOUND -# will be set by this script - -# cmake_minimum_required(VERSION 2.6) - -if(NOT PYTHON_EXECUTABLE) - if(NumPy_FIND_QUIETLY) - find_package(PythonInterp QUIET) - else() - find_package(PythonInterp) - set(__numpy_out 1) - endif() -endif() - -if (PYTHON_EXECUTABLE) - # Find out the include path - execute_process( - COMMAND "${PYTHON_EXECUTABLE}" -c - "from __future__ import print_function\ntry: import numpy; print(numpy.get_include(), end='')\nexcept:pass\n" - OUTPUT_VARIABLE __numpy_path) - # And the version - execute_process( - COMMAND "${PYTHON_EXECUTABLE}" -c - "from __future__ import print_function\ntry: import numpy; print(numpy.__version__, end='')\nexcept:pass\n" - OUTPUT_VARIABLE __numpy_version) -elseif(__numpy_out) - message(STATUS "Python executable not found.") -endif(PYTHON_EXECUTABLE) - -find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h - HINTS "${__numpy_path}" "${PYTHON_INCLUDE_PATH}" NO_DEFAULT_PATH) - -if(PYTHON_NUMPY_INCLUDE_DIR) - set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") -endif(PYTHON_NUMPY_INCLUDE_DIR) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NumPy REQUIRED_VARS PYTHON_NUMPY_INCLUDE_DIR - VERSION_VAR __numpy_version) diff --git 
a/python/eyescale-cmake/LICENSE.txt b/python/eyescale-cmake/LICENSE.txt deleted file mode 100644 index 307d54e5..00000000 --- a/python/eyescale-cmake/LICENSE.txt +++ /dev/null @@ -1,26 +0,0 @@ -Unless otherwise noted in the file, all files in this directory are -licensed under the BSD license, reproduced below. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -- Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -- Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. -- Neither the name of Eyescale Software GmbH nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. diff --git a/python/scikit-build-cmake/FindCython.cmake b/python/scikit-build-cmake/FindCython.cmake index 3d58c4f0..5f2ce6e0 100644 --- a/python/scikit-build-cmake/FindCython.cmake +++ b/python/scikit-build-cmake/FindCython.cmake @@ -13,7 +13,7 @@ # ``CYTHON_FOUND`` # true if the program was found # -# For more information on the Cython project, see http://cython.org/. +# For more information on the Cython project, see https://cython.org/. # # *Cython is a language that makes writing C extensions for the Python language # as easy as Python itself.* @@ -56,7 +56,8 @@ if(CYTHON_EXECUTABLE) OUTPUT_VARIABLE CYTHON_version_output ERROR_VARIABLE CYTHON_version_error RESULT_VARIABLE CYTHON_version_result - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_STRIP_TRAILING_WHITESPACE) if(NOT ${CYTHON_version_result} EQUAL 0) set(_error_msg "Command \"${CYTHON_version_command}\" failed with") @@ -65,6 +66,10 @@ if(CYTHON_EXECUTABLE) else() if("${CYTHON_version_output}" MATCHES "^[Cc]ython version ([^,]+)") set(CYTHON_VERSION "${CMAKE_MATCH_1}") + else() + if("${CYTHON_version_error}" MATCHES "^[Cc]ython version ([^,]+)") + set(CYTHON_VERSION "${CMAKE_MATCH_1}") + endif() endif() endif() endif() diff --git a/python/scikit-build-cmake/FindNumPy.cmake b/python/scikit-build-cmake/FindNumPy.cmake new file mode 100644 index 00000000..cd78112b --- /dev/null +++ b/python/scikit-build-cmake/FindNumPy.cmake @@ -0,0 +1,106 @@ +#.rst: +# +# Find the include directory for ``numpy/arrayobject.h`` as well as other NumPy tools like ``conv-template`` and +# ``from-template``. +# +# This module sets the following variables: +# +# ``NumPy_FOUND`` +# True if NumPy was found. +# ``NumPy_INCLUDE_DIRS`` +# The include directories needed to use NumpPy. 
+# ``NumPy_VERSION`` +# The version of NumPy found. +# ``NumPy_CONV_TEMPLATE_EXECUTABLE`` +# Path to conv-template executable. +# ``NumPy_FROM_TEMPLATE_EXECUTABLE`` +# Path to from-template executable. +# +# The module will also explicitly define one cache variable: +# +# ``NumPy_INCLUDE_DIR`` +# +# .. note:: +# +# To support NumPy < v0.15.0 where ``from-template`` and ``conv-template`` are not declared as entry points, +# the module emulates the behavior of standalone executables by setting the corresponding variables with the +# path the the python interpreter and the path to the associated script. For example: +# :: +# +# set(NumPy_CONV_TEMPLATE_EXECUTABLE /path/to/python /path/to/site-packages/numpy/distutils/conv_template.py CACHE STRING "Command executing conv-template program" FORCE) +# +# set(NumPy_FROM_TEMPLATE_EXECUTABLE /path/to/python /path/to/site-packages/numpy/distutils/from_template.py CACHE STRING "Command executing from-template program" FORCE) +# + +if(NOT NumPy_FOUND) + set(_find_extra_args) + if(NumPy_FIND_REQUIRED) + list(APPEND _find_extra_args REQUIRED) + endif() + if(NumPy_FIND_QUIET) + list(APPEND _find_extra_args QUIET) + endif() + find_package(PythonInterp ${_find_extra_args}) + find_package(PythonLibs ${_find_extra_args}) + + find_program(NumPy_CONV_TEMPLATE_EXECUTABLE NAMES conv-template) + find_program(NumPy_FROM_TEMPLATE_EXECUTABLE NAMES from-template) + + if(PYTHON_EXECUTABLE) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" + -c "import numpy; print(numpy.get_include())" + OUTPUT_VARIABLE _numpy_include_dir + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" + -c "import numpy; print(numpy.__version__)" + OUTPUT_VARIABLE NumPy_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + + # XXX This is required to support NumPy < v0.15.0. See note in module documentation above. + if(NOT NumPy_CONV_TEMPLATE_EXECUTABLE) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" + -c "from numpy.distutils import conv_template; print(conv_template.__file__)" + OUTPUT_VARIABLE _numpy_conv_template_file + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + set(NumPy_CONV_TEMPLATE_EXECUTABLE "${PYTHON_EXECUTABLE}" "${_numpy_conv_template_file}" CACHE STRING "Command executing conv-template program" FORCE) + endif() + + # XXX This is required to support NumPy < v0.15.0. See note in module documentation above. 
+ if(NOT NumPy_FROM_TEMPLATE_EXECUTABLE) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" + -c "from numpy.distutils import from_template; print(from_template.__file__)" + OUTPUT_VARIABLE _numpy_from_template_file + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + set(NumPy_FROM_TEMPLATE_EXECUTABLE "${PYTHON_EXECUTABLE}" "${_numpy_from_template_file}" CACHE STRING "Command executing from-template program" FORCE) + endif() + endif() +endif() + +find_path(NumPy_INCLUDE_DIR + numpy/arrayobject.h + PATHS "${_numpy_include_dir}" "${PYTHON_INCLUDE_DIR}" + PATH_SUFFIXES numpy/core/include + ) + +set(NumPy_INCLUDE_DIRS ${NumPy_INCLUDE_DIR}) + +# handle the QUIETLY and REQUIRED arguments and set NumPy_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NumPy + REQUIRED_VARS + NumPy_INCLUDE_DIR + NumPy_CONV_TEMPLATE_EXECUTABLE + NumPy_FROM_TEMPLATE_EXECUTABLE + VERSION_VAR NumPy_VERSION + ) + +mark_as_advanced(NumPy_INCLUDE_DIR) diff --git a/python/scikit-build-cmake/FindPythonExtensions.cmake b/python/scikit-build-cmake/FindPythonExtensions.cmake index 9a3d76a0..33e034d0 100644 --- a/python/scikit-build-cmake/FindPythonExtensions.cmake +++ b/python/scikit-build-cmake/FindPythonExtensions.cmake @@ -104,9 +104,10 @@ # [HEADER_OUTPUT_VAR ] # [INCLUDE_DIR_OUTPUT_VAR ]) # +# without the extension is used as the logical name. If only ```` is +# # If only ```` is provided, and it ends in the ".h" extension, then it # is assumed to be the ````. The filename of the header file -# without the extension is used as the logical name. If only ```` is # provided, and it does not end in the ".h" extension, then the # ```` is assumed to ``.h``. # @@ -200,7 +201,7 @@ # FORWARD_DECL_MODULES_VAR fdecl_module_list) # # # module2 -- dynamically linked -# include_directories({Boost_INCLUDE_DIRS}) +# include_directories(${Boost_INCLUDE_DIRS}) # add_library(module2 SHARED boost_module2.cxx) # target_link_libraries(module2 ${Boost_LIBRARIES}) # python_extension_module(module2 @@ -209,7 +210,7 @@ # # # module3 -- loaded at runtime # add_cython_target(module3a.pyx) -# add_library(module1 MODULE ${module3a} module3b.cxx) +# add_library(module3 MODULE ${module3a} module3b.cxx) # target_link_libraries(module3 ${Boost_LIBRARIES}) # python_extension_module(module3 # LINKED_MODULES_VAR linked_module_list @@ -254,7 +255,6 @@ import os import os.path import site import sys -import sysconfig result = None rel_result = None @@ -282,13 +282,17 @@ for candidate in candidates: rel_result = rel_candidate break +ext_suffix_var = 'SO' +if sys.version_info[:2] >= (3, 5): + ext_suffix_var = 'EXT_SUFFIX' + sys.stdout.write(\";\".join(( os.sep, os.pathsep, sys.prefix, result, rel_result, - sysconfig.get_config_var('SO') + distutils.sysconfig.get_config_var(ext_suffix_var) ))) ") @@ -327,12 +331,11 @@ function(_set_python_extension_symbol_visibility _target) else() set(_modinit_prefix "init") endif() - message("_modinit_prefix:${_modinit_prefix}") if("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") set_target_properties(${_target} PROPERTIES LINK_FLAGS "/EXPORT:${_modinit_prefix}${_target}" ) - elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(_script_path ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}-version-script.map ) @@ -340,7 +343,7 @@ function(_set_python_extension_symbol_visibility _target) "{global: ${_modinit_prefix}${_target}; local: *; };" ) set_property(TARGET 
${_target} APPEND_STRING PROPERTY LINK_FLAGS - " -Wl,--version-script=${_script_path}" + " -Wl,--version-script=\"${_script_path}\"" ) endif() endfunction() @@ -423,7 +426,7 @@ function(python_extension_module _target) target_link_libraries_with_dynamic_lookup(${_target} ${PYTHON_LIBRARIES}) if(_is_module_lib) - #_set_python_extension_symbol_visibility(${_altname}) + _set_python_extension_symbol_visibility(${_target}) endif() endif() endfunction() diff --git a/python/scikit-build-cmake/LICENSE b/python/scikit-build-cmake/LICENSE index 73a9db0f..3a85dcff 100644 --- a/python/scikit-build-cmake/LICENSE +++ b/python/scikit-build-cmake/LICENSE @@ -1,6 +1,3 @@ -Unless otherwise noted in the file, all files in this directory are -licensed under the MIT license, reproduced below. - The MIT License (MIT) Copyright (c) 2014 Mike Sarahan diff --git a/python/scikit-build-cmake/UseCython.cmake b/python/scikit-build-cmake/UseCython.cmake index 9a596648..2c40bd7b 100644 --- a/python/scikit-build-cmake/UseCython.cmake +++ b/python/scikit-build-cmake/UseCython.cmake @@ -43,7 +43,7 @@ # ``PY2 | PY3`` # Force compilation using either Python-2 or Python-3 syntax and code # semantics. By default, Python-2 syntax and semantics are used if the major -# version of Python found is 2. Otherwise, Python-3 syntax and sematics are +# version of Python found is 2. Otherwise, Python-3 syntax and semantics are # used. # # ``OUTPUT_VAR `` @@ -56,13 +56,13 @@ # ```` # The path of the generated source file. # -# Cache variables that effect the behavior include: +# Cache variables that affect the behavior include: # # ``CYTHON_ANNOTATE`` -# whether to create an annotated .html file when compiling +# Whether to create an annotated .html file when compiling. # # ``CYTHON_FLAGS`` -# additional flags to pass to the Cython compiler +# Additional flags to pass to the Cython compiler. # # Example usage # ^^^^^^^^^^^^^ @@ -101,7 +101,6 @@ set(CYTHON_ANNOTATE OFF set(CYTHON_FLAGS "" CACHE STRING "Extra flags to the cython compiler.") mark_as_advanced(CYTHON_ANNOTATE CYTHON_FLAGS) -string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") find_package(PythonLibs REQUIRED) @@ -138,6 +137,14 @@ function(add_cython_target _name) set(_embed_main FALSE) + if("C" IN_LIST languages) + set(_output_syntax "C") + elseif("CXX" IN_LIST languages) + set(_output_syntax "CXX") + else() + message(FATAL_ERROR "Either C or CXX must be enabled to use Cython") + endif() + if("${PYTHONLIBS_VERSION_STRING}" MATCHES "^2.") set(_input_syntax "PY2") else() @@ -323,21 +330,11 @@ function(add_cython_target _name) set(annotate_arg "--annotate") endif() - set(no_docstrings_arg "") - set(embed_signature_arg "") - if(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - set(no_docstrings_arg "--no-docstrings") - else() - set(embed_signature_arg "-Xembedsignature=True") - endif() - set(cython_debug_arg "") - set(embed_pos_arg "") set(line_directives_arg "") if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") set(cython_debug_arg "--gdb") - set(embed_pos_arg "--embed-positions") set(line_directives_arg "--line-directives") endif() @@ -352,12 +349,13 @@ function(add_cython_target _name) list(REMOVE_DUPLICATES pxd_dependencies) list(REMOVE_DUPLICATES c_header_dependencies) + string(REGEX REPLACE " " ";" CYTHON_FLAGS_LIST "${CYTHON_FLAGS}") + # Add the command to run the compiler. 
add_custom_command(OUTPUT ${generated_file} COMMAND ${CYTHON_EXECUTABLE} ARGS ${cxx_arg} ${include_directory_arg} ${py_version_arg} - ${embed_arg} ${annotate_arg} ${no_docstrings_arg} - ${cython_debug_arg} ${embed_pos_arg} ${embed_signature_arg} + ${embed_arg} ${annotate_arg} ${cython_debug_arg} ${line_directives_arg} ${CYTHON_FLAGS_LIST} ${pyx_location} --output-file ${generated_file} DEPENDS ${_source_file} diff --git a/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake b/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake index 020fc404..6199ed5e 100644 --- a/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake +++ b/python/scikit-build-cmake/targetLinkLibrariesWithDynamicLookup.cmake @@ -362,7 +362,7 @@ function(_test_weak_link_project file(APPEND "${test_project_src_dir}/main.c" " goto done; error: - fprintf(stderr, \"Error occured:\\n %s\\n\", dlerror()); + fprintf(stderr, \"Error occurred:\\n %s\\n\", dlerror()); result = 1; done: diff --git a/python/zfpy.pxd b/python/zfpy.pxd index f812aed6..4d89776b 100644 --- a/python/zfpy.pxd +++ b/python/zfpy.pxd @@ -1,11 +1,11 @@ import cython cimport libc.stdint as stdint -cdef extern from "bitstream.h": +cdef extern from "zfp/bitstream.h": cdef struct bitstream: pass - bitstream* stream_open(void* data, size_t); - void stream_close(bitstream* stream); + bitstream* stream_open(void* data, size_t) + void stream_close(bitstream* stream) cdef extern from "zfp.h": # enums @@ -21,51 +21,54 @@ cdef extern from "zfp.h": zfp_mode_expert = 1, zfp_mode_fixed_rate = 2, zfp_mode_fixed_precision = 3, - zfp_mode_fixed_accuracy = 4 + zfp_mode_fixed_accuracy = 4, + zfp_mode_reversible = 5 # structs ctypedef struct zfp_field: zfp_type _type "type" - cython.uint nx, ny, nz, nw - int sx, sy, sz, sw + size_t nx, ny, nz, nw + ptrdiff_t sx, sy, sz, sw void* data ctypedef struct zfp_stream: pass + ctypedef int zfp_bool + # include #define's cython.uint ZFP_HEADER_MAGIC cython.uint ZFP_HEADER_META cython.uint ZFP_HEADER_MODE cython.uint ZFP_HEADER_FULL - # function definitions - zfp_stream* zfp_stream_open(bitstream* stream); - void zfp_stream_close(zfp_stream* stream); - size_t zfp_stream_maximum_size(const zfp_stream* stream, const zfp_field* field); - void zfp_stream_set_bit_stream(zfp_stream* stream, bitstream* bs); - cython.uint zfp_stream_set_precision(zfp_stream* stream, cython.uint precision); - double zfp_stream_set_accuracy(zfp_stream* stream, double tolerance); - double zfp_stream_set_rate(zfp_stream* stream, double rate, zfp_type type, cython.uint dims, int wra); - void zfp_stream_set_reversible(zfp_stream* stream); - stdint.uint64_t zfp_stream_mode(const zfp_stream* zfp); - zfp_mode zfp_stream_set_mode(zfp_stream* stream, stdint.uint64_t mode); - zfp_field* zfp_field_alloc(); - zfp_field* zfp_field_1d(void* pointer, zfp_type, cython.uint nx); - zfp_field* zfp_field_2d(void* pointer, zfp_type, cython.uint nx, cython.uint ny); - zfp_field* zfp_field_3d(void* pointer, zfp_type, cython.uint nx, cython.uint ny, cython.uint nz); - zfp_field* zfp_field_4d(void* pointer, zfp_type, cython.uint nx, cython.uint ny, cython.uint nz, cython.uint nw); - void zfp_field_set_stride_1d(zfp_field* field, int sx); - void zfp_field_set_stride_2d(zfp_field* field, int sx, int sy); - void zfp_field_set_stride_3d(zfp_field* field, int sx, int sy, int sz); - void zfp_field_set_stride_4d(zfp_field* field, int sx, int sy, int sz, int sw); - int zfp_field_stride(const zfp_field* field, int* stride) - void zfp_field_free(zfp_field* 
field); - zfp_type zfp_field_set_type(zfp_field* field, zfp_type type); - size_t zfp_compress(zfp_stream* stream, const zfp_field* field) nogil; - size_t zfp_decompress(zfp_stream* stream, zfp_field* field) nogil; - size_t zfp_write_header(zfp_stream* stream, const zfp_field* field, cython.uint mask); - size_t zfp_read_header(zfp_stream* stream, zfp_field* field, cython.uint mask); - void zfp_stream_rewind(zfp_stream* stream); - void zfp_field_set_pointer(zfp_field* field, void* pointer) nogil; + # function declarations + zfp_stream* zfp_stream_open(bitstream* stream) + void zfp_stream_close(zfp_stream* stream) + stdint.uint64_t zfp_stream_mode(const zfp_stream* zfp) + size_t zfp_stream_maximum_size(const zfp_stream* stream, const zfp_field* field) + void zfp_stream_rewind(zfp_stream* stream) + void zfp_stream_set_bit_stream(zfp_stream* stream, bitstream* bs) + void zfp_stream_set_reversible(zfp_stream* stream) + double zfp_stream_set_rate(zfp_stream* stream, double rate, zfp_type type, cython.uint dims, zfp_bool align) + cython.uint zfp_stream_set_precision(zfp_stream* stream, cython.uint precision) + double zfp_stream_set_accuracy(zfp_stream* stream, double tolerance) + zfp_mode zfp_stream_set_mode(zfp_stream* stream, stdint.uint64_t mode) + zfp_field* zfp_field_alloc() + zfp_field* zfp_field_1d(void* pointer, zfp_type, size_t nx) + zfp_field* zfp_field_2d(void* pointer, zfp_type, size_t nx, size_t ny) + zfp_field* zfp_field_3d(void* pointer, zfp_type, size_t nx, size_t ny, size_t nz) + zfp_field* zfp_field_4d(void* pointer, zfp_type, size_t nx, size_t ny, size_t nz, size_t nw) + void zfp_field_free(zfp_field* field) + zfp_bool zfp_field_stride(const zfp_field* field, ptrdiff_t* stride) + void zfp_field_set_pointer(zfp_field* field, void* pointer) nogil + zfp_type zfp_field_set_type(zfp_field* field, zfp_type type) + void zfp_field_set_stride_1d(zfp_field* field, ptrdiff_t sx) + void zfp_field_set_stride_2d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy) + void zfp_field_set_stride_3d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) + void zfp_field_set_stride_4d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) + size_t zfp_compress(zfp_stream* stream, const zfp_field* field) nogil + size_t zfp_decompress(zfp_stream* stream, zfp_field* field) nogil + size_t zfp_write_header(zfp_stream* stream, const zfp_field* field, cython.uint mask) + size_t zfp_read_header(zfp_stream* stream, zfp_field* field, cython.uint mask) cdef gen_padded_int_list(orig_array, pad=*, length=*) diff --git a/python/zfpy.pyx b/python/zfpy.pyx index 1f38697f..6ac9e11c 100644 --- a/python/zfpy.pyx +++ b/python/zfpy.pyx @@ -4,8 +4,7 @@ import functools import cython from libc.stdlib cimport malloc, free from cython cimport view -from cpython cimport array -import array +from libc.stdint cimport uint8_t import itertools if sys.version_info[0] == 2: @@ -51,11 +50,11 @@ cpdef dtype_to_ztype(dtype): cpdef dtype_to_format(dtype): # format characters detailed here: - # https://docs.python.org/2/library/array.html#module-array + # https://docs.python.org/3/library/array.html if dtype == np.int32: return 'i' # signed int elif dtype == np.int64: - return 'l' # signed long + return 'q' # signed long long elif dtype == np.float32: return 'f' # float elif dtype == np.float64: @@ -75,7 +74,7 @@ cpdef ztype_to_dtype(zfp_type ztype): except KeyError: raise ValueError("Unsupported zfp_type {}".format(ztype)) -cdef zfp_field* _init_field(np.ndarray arr): +cdef zfp_field* _init_field(np.ndarray arr) 
except NULL: shape = arr.shape cdef int ndim = arr.ndim cdef zfp_type ztype = dtype_to_ztype(arr.dtype) @@ -142,8 +141,8 @@ cpdef bytes compress_numpy( cdef zfp_field* field = _init_field(arr) cdef zfp_stream* stream = zfp_stream_open(NULL) - cdef zfp_type ztype = zfp_type_none; - cdef int ndim = arr.ndim; + cdef zfp_type ztype = zfp_type_none + cdef int ndim = arr.ndim _set_compression_mode(stream, ztype, ndim, tolerance, rate, precision) # Allocate space based on the maximum size potentially required by zfp to @@ -245,7 +244,7 @@ cdef _validate_4d_list(in_list, list_name): ) cpdef np.ndarray _decompress( - bytes compressed_data, + const uint8_t[::1] compressed_data, zfp_type ztype, shape, out=None, @@ -253,17 +252,16 @@ cpdef np.ndarray _decompress( double rate = -1, int precision = -1, ): - if compressed_data is None: raise TypeError("compressed_data cannot be None") if compressed_data is out: raise ValueError("Cannot decompress in-place") _validate_4d_list(shape, "shape") - cdef char* comp_data_pointer = compressed_data + cdef const void* comp_data_pointer = &compressed_data[0] cdef zfp_field* field = zfp_field_alloc() cdef bitstream* bstream = stream_open( - comp_data_pointer, + comp_data_pointer, len(compressed_data) ) cdef zfp_stream* stream = zfp_stream_open(bstream) @@ -329,15 +327,15 @@ cpdef np.ndarray _decompress( return output cpdef np.ndarray decompress_numpy( - bytes compressed_data, + const uint8_t[::1] compressed_data, ): if compressed_data is None: raise TypeError("compressed_data cannot be None") - cdef char* comp_data_pointer = compressed_data + cdef const void* comp_data_pointer = &compressed_data[0] cdef zfp_field* field = zfp_field_alloc() cdef bitstream* bstream = stream_open( - comp_data_pointer, + comp_data_pointer, len(compressed_data) ) cdef zfp_stream* stream = zfp_stream_open(bstream) diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..9e4c3911 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup, Extension +import numpy as np + +setup( + name="zfpy", + version="1.0.0", + author="Peter Lindstrom", + author_email="zfp@llnl.gov", + url="https://zfp.llnl.gov", + description="zfp compression in Python", + long_description="zfp is a compressed format for representing multidimensional floating-point and integer arrays. zfp provides compressed-array classes that support high throughput read and write random access to individual array elements. zfp also supports serial and parallel compression of whole arrays using both lossless and lossy compression with error tolerances. 
zfp is primarily written in C and C++ but also includes Python and Fortran bindings.", + ext_modules=[Extension("zfpy", ["build/python/zfpy.c"], + include_dirs=["include", np.get_include()], + libraries=["zfp"], library_dirs=["build/lib64", "build/lib/Release"])] +) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 46887588..fd5702e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,11 +1,11 @@ if(ZFP_WITH_CUDA) - SET(CMAKE_CXX_FLAGS_PREVIOUS ${CMAKE_CXX_FLAGS}) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + set(CMAKE_CXX_FLAGS_PREVIOUS ${CMAKE_CXX_FLAGS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) add_subdirectory(cuda_zfp) cuda_include_directories(${PROJECT_SOURCE_DIR}/include) - cuda_wrap_srcs(zfp OBJ zfp_cuda_backend_obj cuda_zfp/cuZFP.cu) - SET(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_PREVIOUS}) + cuda_wrap_srcs(zfp OBJ zfp_cuda_backend_obj cuda_zfp/cuZFP.cu OPTIONS ${CMAKE_CUDA_FLAGS}) + set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_PREVIOUS}) add_definitions(-DZFP_WITH_CUDA) endif() @@ -28,16 +28,16 @@ add_library(zfp ${zfp_source} add_library(zfp::zfp ALIAS zfp) if(ZFP_WITH_OPENMP) - target_compile_options(zfp PRIVATE ${OpenMP_C_FLAGS}) - target_link_libraries(zfp PRIVATE ${OpenMP_C_LIBRARIES}) + target_link_libraries(zfp PRIVATE OpenMP::OpenMP_C) endif() if(HAVE_LIBM_MATH) target_link_libraries(zfp PRIVATE m) endif() -if(WIN32) +if(WIN32 AND BUILD_SHARED_LIBS) # Define ZFP_SOURCE when compiling libzfp to export symbols to Windows DLL + list(APPEND zfp_public_defs ZFP_SHARED_LIBS) list(APPEND zfp_private_defs ZFP_SOURCE) endif() @@ -54,8 +54,7 @@ target_include_directories(zfp PUBLIC $ $ - INTERFACE - $) +) set_property(TARGET zfp PROPERTY VERSION ${ZFP_VERSION}) set_property(TARGET zfp PROPERTY SOVERSION ${ZFP_VERSION_MAJOR}) diff --git a/src/Makefile b/src/Makefile index 227a7803..239261fb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -21,4 +21,4 @@ $(LIBDIR)/libzfp.so: $(OBJECTS) $(CC) $(CFLAGS) -shared $^ -o $@ .c.o: - $(CC) $(CFLAGS) -c $< + $(CC) $(CFLAGS) -I../include -c $< diff --git a/src/bitstream.c b/src/bitstream.c index 05094c6d..29a4543a 100644 --- a/src/bitstream.c +++ b/src/bitstream.c @@ -1,4 +1,4 @@ -#include "bitstream.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.h" +#include "zfp/bitstream.inl" -export_ const size_t stream_word_bits = wsize; +const size_t stream_word_bits = wsize; diff --git a/src/cuda_zfp/CMakeLists.txt b/src/cuda_zfp/CMakeLists.txt index 2fe402fa..b19546d8 100644 --- a/src/cuda_zfp/CMakeLists.txt +++ b/src/cuda_zfp/CMakeLists.txt @@ -18,7 +18,6 @@ set(cuZFP_sources type_info.cuh) set(cuZFP_headers - constant_setup.cuh shared.h cuZFP.h ErrorCheck.h) diff --git a/src/cuda_zfp/constant_setup.cuh b/src/cuda_zfp/constant_setup.cuh deleted file mode 100644 index 1c1221ad..00000000 --- a/src/cuda_zfp/constant_setup.cuh +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef cuZFP_CONSTANT_SETUP -#define cuZFP_CONSTANT_SETUP - -#include "constants.h" -#include "shared.h" -#include "ErrorCheck.h" -#include "type_info.cuh" - -namespace cuZFP { - -class ConstantSetup -{ -public: - static void setup_3d() - { - ErrorCheck ec; - cudaMemcpyToSymbol(c_perm, perm_3d, sizeof(unsigned char) * 64, 0); - ec.chk("setupConst: c_perm"); - } - - static void setup_2d() - { - ErrorCheck ec; - cudaMemcpyToSymbol(c_perm_2, perm_2, sizeof(unsigned char) * 16, 0); - ec.chk("setupConst: c_perm_2"); - } - - static void setup_1d() - { - ErrorCheck ec; - cudaMemcpyToSymbol(c_perm_1, perm_1, sizeof(unsigned char) * 4, 0); - ec.chk("setupConst: c_perm_1"); - 
} -}; - - -} //namespace - -#endif diff --git a/src/cuda_zfp/constants.h b/src/cuda_zfp/constants.h index 423ac91c..a03eb6f8 100644 --- a/src/cuda_zfp/constants.h +++ b/src/cuda_zfp/constants.h @@ -5,7 +5,7 @@ namespace cuZFP { #define index_3d(x, y, z) ((x) + 4 * ((y) + 4 * (z))) -static const unsigned char +__device__ static const unsigned char perm_3d[64] = { index_3d(0, 0, 0), // 0 : 0 @@ -94,7 +94,7 @@ perm_3d[64] = { #undef index_3d -static const unsigned char perm_1[4] = +__device__ static const unsigned char perm_1[4] = { 0, 1, 2, 3 }; @@ -102,7 +102,7 @@ static const unsigned char perm_1[4] = #define index(i, j) ((i) + 4 * (j)) /* order coefficients (i, j) by i + j, then i^2 + j^2 */ -static const unsigned char perm_2[16] = { +__device__ static const unsigned char perm_2[16] = { index(0, 0), /* 0 : 0 */ index(1, 0), /* 1 : 1 */ diff --git a/src/cuda_zfp/cuZFP.cu b/src/cuda_zfp/cuZFP.cu index ffbb9933..e1de467f 100644 --- a/src/cuda_zfp/cuZFP.cu +++ b/src/cuda_zfp/cuZFP.cu @@ -12,7 +12,6 @@ #include "ErrorCheck.h" -#include "constant_setup.cuh" #include "pointers.cuh" #include "type_info.cuh" #include @@ -24,7 +23,7 @@ #define inline_ inline #endif -#include "../inline/bitstream.c" +#include "zfp/bitstream.inl" namespace internal { @@ -119,7 +118,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ { int dim = dims[0]; int sx = stride.x; - cuZFP::ConstantSetup::setup_1d(); stream_size = cuZFP::encode1(dim, sx, d_data, d_stream, bits_per_block); } else if(d == 2) @@ -128,7 +126,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ int2 s; s.x = stride.x; s.y = stride.y; - cuZFP::ConstantSetup::setup_2d(); stream_size = cuZFP::encode2(ndims, s, d_data, d_stream, bits_per_block); } else if(d == 3) @@ -138,7 +135,6 @@ size_t encode(uint dims[3], int3 stride, int bits_per_block, T *d_data, Word *d_ s.y = stride.y; s.z = stride.z; uint3 ndims = make_uint3(dims[0], dims[1], dims[2]); - cuZFP::ConstantSetup::setup_3d(); stream_size = cuZFP::encode(ndims, s, d_data, d_stream, bits_per_block); } @@ -172,7 +168,6 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o s.y = stride.y; s.z = stride.z; - cuZFP::ConstantSetup::setup_3d(); stream_bytes = cuZFP::decode3(dims, s, stream, out, bits_per_block); } else if(d == 1) @@ -180,7 +175,6 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o uint dim = ndims[0]; int sx = stride.x; - cuZFP::ConstantSetup::setup_1d(); stream_bytes = cuZFP::decode1(dim, sx, stream, out, bits_per_block); } @@ -194,7 +188,6 @@ size_t decode(uint ndims[3], int3 stride, int bits_per_block, Word *stream, T *o s.x = stride.x; s.y = stride.y; - cuZFP::ConstantSetup::setup_2d(); stream_bytes = cuZFP::decode2(dims, s, stream, out, bits_per_block); } else std::cerr<<" d == "<stream->begin); - assert(sizeof(word) == sizeof(Word)); // "CUDA version currently only supports 64bit words"); + assert(sizeof(bitstream_word) == sizeof(Word)); // "CUDA version currently only supports 64bit words"); if(stream_device) { return (Word*) stream->stream->begin; - } + } Word *d_stream = NULL; - // TODO: we we have a real stream we can just ask it how big it is size_t max_size = zfp_stream_maximum_size(stream, field); cudaMalloc(&d_stream, max_size); - cudaMemcpy(d_stream, stream->stream->begin, max_size, cudaMemcpyHostToDevice); + return d_stream; +} + +Word *setup_device_stream_decompress(zfp_stream *stream,const zfp_field *field) +{ + bool stream_device = 
cuZFP::is_gpu_ptr(stream->stream->begin); + assert(sizeof(bitstream_word) == sizeof(Word)); // "CUDA version currently only supports 64bit words"); + + if(stream_device) + { + return (Word*) stream->stream->begin; + } + + Word *d_stream = NULL; + //TODO: change maximum_size to compressed stream size + size_t size = zfp_stream_maximum_size(stream, field); + cudaMalloc(&d_stream, size); + cudaMemcpy(d_stream, stream->stream->begin, size, cudaMemcpyHostToDevice); return d_stream; } @@ -246,7 +255,7 @@ void * offset_void(zfp_type type, void *ptr, long long int offset) return offset_ptr; } -void *setup_device_field(const zfp_field *field, const int3 &stride, long long int &offset) +void *setup_device_field_compress(const zfp_field *field, const int3 &stride, long long int &offset) { bool field_device = cuZFP::is_gpu_ptr(field->data); @@ -287,6 +296,43 @@ void *setup_device_field(const zfp_field *field, const int3 &stride, long long i return offset_void(field->type, d_data, -offset); } +void *setup_device_field_decompress(const zfp_field *field, const int3 &stride, long long int &offset) +{ + bool field_device = cuZFP::is_gpu_ptr(field->data); + + if(field_device) + { + offset = 0; + return field->data; + } + + uint dims[3]; + dims[0] = field->nx; + dims[1] = field->ny; + dims[2] = field->nz; + + size_t type_size = zfp_type_size(field->type); + + size_t field_size = 1; + for(int i = 0; i < 3; ++i) + { + if(dims[i] != 0) + { + field_size *= dims[i]; + } + } + + bool contig = internal::is_contigous(dims, stride, offset); + + void *d_data = NULL; + if(contig) + { + size_t field_bytes = type_size * field_size; + cudaMalloc(&d_data, field_bytes); + } + return offset_void(field->type, d_data, -offset); +} + void cleanup_device_ptr(void *orig_ptr, void *d_ptr, size_t bytes, long long int offset, zfp_type type) { bool device = cuZFP::is_gpu_ptr(orig_ptr); @@ -323,7 +369,7 @@ cuda_compress(zfp_stream *stream, const zfp_field *field) size_t stream_bytes = 0; long long int offset = 0; - void *d_data = internal::setup_device_field(field, stride, offset); + void *d_data = internal::setup_device_field_compress(field, stride, offset); if(d_data == NULL) { @@ -331,7 +377,7 @@ cuda_compress(zfp_stream *stream, const zfp_field *field) return 0; } - Word *d_stream = internal::setup_device_stream(stream, field); + Word *d_stream = internal::setup_device_stream_compress(stream, field); if(field->type == zfp_type_float) { @@ -382,7 +428,7 @@ cuda_decompress(zfp_stream *stream, zfp_field *field) size_t decoded_bytes = 0; long long int offset = 0; - void *d_data = internal::setup_device_field(field, stride, offset); + void *d_data = internal::setup_device_field_decompress(field, stride, offset); if(d_data == NULL) { @@ -390,7 +436,7 @@ cuda_decompress(zfp_stream *stream, zfp_field *field) return; } - Word *d_stream = internal::setup_device_stream(stream, field); + Word *d_stream = internal::setup_device_stream_decompress(stream, field); if(field->type == zfp_type_float) { @@ -437,7 +483,7 @@ cuda_decompress(zfp_stream *stream, zfp_field *field) internal::cleanup_device_ptr(stream->stream->begin, d_stream, 0, 0, field->type); internal::cleanup_device_ptr(field->data, d_data, bytes, offset, field->type); - // this is how zfp determins if this was a success + // this is how zfp determines if this was a success size_t words_read = decoded_bytes / sizeof(Word); stream->stream->bits = wsize; // set stream pointer to end of stream diff --git a/src/cuda_zfp/decode.cuh b/src/cuda_zfp/decode.cuh index d3d08772..636de7d4 100644 
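For orientation before the decode.cuh changes below: the CUDA host code above and the zfpy declarations earlier in this patch both drive the same high-level C API. A minimal fixed-rate round trip using only functions declared in the zfpy.pxd hunk might look like the following sketch (error handling abbreviated; the 8 bits/value rate is an arbitrary choice)::

  #include <stdlib.h>
  #include "zfp.h"

  /* compress and immediately decompress a 2D double array at a fixed rate */
  static int
  roundtrip(double* data, size_t nx, size_t ny)
  {
    zfp_field* field = zfp_field_2d(data, zfp_type_double, nx, ny);
    zfp_stream* zfp = zfp_stream_open(NULL);
    zfp_stream_set_rate(zfp, 8.0, zfp_type_double, 2, zfp_false);

    /* allocate a buffer large enough for the worst case */
    size_t bufsize = zfp_stream_maximum_size(zfp, field);
    void* buffer = malloc(bufsize);
    bitstream* stream = stream_open(buffer, bufsize);
    zfp_stream_set_bit_stream(zfp, stream);

    size_t zsize = zfp_compress(zfp, field);  /* compressed size in bytes; 0 on failure */
    int ok = 0;
    if (zsize) {
      zfp_stream_rewind(zfp);
      ok = zfp_decompress(zfp, field) != 0;   /* overwrites data with decompressed values */
    }

    zfp_field_free(field);
    zfp_stream_close(zfp);
    stream_close(stream);
    free(buffer);
    return ok;
  }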
--- a/src/cuda_zfp/decode.cuh +++ b/src/cuda_zfp/decode.cuh @@ -6,17 +6,34 @@ namespace cuZFP { -/* map two's complement signed integer to negabinary unsigned integer */ +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST +// bias values such that truncation is equivalent to round to nearest +template +__device__ +static void +inv_round(UInt* ublock, uint m, uint prec) +{ + // add 1/6 ulp to unbias errors + if (prec < (uint)(CHAR_BIT * sizeof(UInt) - 1)) { + // the first m values (0 <= m <= n) have one more bit of precision + uint n = BlockSize - m; + while (m--) *ublock++ += (((UInt)NBMASK >> 2) >> prec); + while (n--) *ublock++ += (((UInt)NBMASK >> 1) >> prec); + } +} +#endif + +// map two's complement signed integer to negabinary unsigned integer inline __device__ long long int uint2int(unsigned long long int x) { - return (x ^0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull; + return (x ^ 0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull; } inline __device__ int uint2int(unsigned int x) { - return (x ^0xaaaaaaaau) - 0xaaaaaaaau; + return (x ^ 0xaaaaaaaau) - 0xaaaaaaaau; } template @@ -40,10 +57,10 @@ public: : m_maxbits(maxbits), m_valid_block(true) { if(block_idx >= num_blocks) m_valid_block = false; - int word_index = (block_idx * maxbits) / (sizeof(Word) * 8); + size_t word_index = ((size_t)block_idx * maxbits) / (sizeof(Word) * 8); m_words = b + word_index; m_buffer = *m_words; - m_current_bit = (block_idx * maxbits) % (sizeof(Word) * 8); + m_current_bit = ((size_t)block_idx * maxbits) % (sizeof(Word) * 8); m_buffer >>= m_current_bit; m_block_idx = block_idx; @@ -95,7 +112,7 @@ public: next_read = n_bits - first_read; } - // this is basically a no-op when first read constained + // this is basically a no-op when first read contained // all the bits. TODO: if we have aligned reads, this could // be a conditional without divergence mask = ((Word)1<<((next_read)))-1; @@ -107,38 +124,59 @@ public: }; // block reader -template +template inline __device__ -void decode_ints(BlockReader &reader, uint &max_bits, UInt *data) +void decode_ints(BlockReader &reader, uint maxbits, UInt *data) { const int intprec = get_precision(); - memset(data, 0, sizeof(UInt) * Size); - uint64 x; // maxprec = 64; const uint kmin = 0; //= intprec > maxprec ? 
intprec - maxprec : 0; - int bits = max_bits; - for (uint k = intprec, n = 0; bits && k-- > kmin;) - { - // read bit plane - uint m = MIN(n, bits); + uint bits = maxbits; + uint k, m, n; + + // initialize data array to all zeros + memset(data, 0, size * sizeof(UInt)); + + // decode one bit plane at a time from MSB to LSB + for (k = intprec, m = n = 0; bits && (m = 0, k-- > kmin);) { + // step 1: decode first n bits of bit plane #k + m = min(n, bits); bits -= m; - x = reader.read_bits(m); - for (; n < Size && bits && (bits--, reader.read_bit()); x += (Word) 1 << n++) - for (; n < (Size - 1) && bits && (bits--, !reader.read_bit()); n++); - - // deposit bit plane + uint64 x = reader.read_bits(m); + // step 2: unary run-length decode remainder of bit plane + for (; bits && n < size; n++, m = n) { + bits--; + if (reader.read_bit()) { + // positive group test; scan for next one-bit + for (; bits && n < size - 1; n++) { + bits--; + if (reader.read_bit()) + break; + } + // set bit and continue decoding bit plane + x += (uint64)1 << n; + } + else { + // negative group test; done with bit plane + m = size; + break; + } + } + // step 3: deposit bit plane from x #if (CUDART_VERSION < 8000) #pragma unroll #else - #pragma unroll Size + #pragma unroll size #endif - for (int i = 0; i < Size; i++, x >>= 1) - { + for (uint i = 0; i < size; i++, x >>= 1) data[i] += (UInt)(x & 1u) << k; - } - } -} + } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST + // bias values to achieve proper rounding + inv_round(data, m, intprec - k); +#endif +} template struct inv_transform; @@ -149,21 +187,19 @@ struct inv_transform<64> template __device__ void inv_xform(Int *p) { - uint x, y, z; - /* transform along z */ - for (y = 0; y < 4; y++) - for (x = 0; x < 4; x++) - inv_lift(p + 1 * x + 4 * y); - /* transform along y */ - for (x = 0; x < 4; x++) - for (z = 0; z < 4; z++) - inv_lift(p + 16 * z + 1 * x); - /* transform along x */ - for (z = 0; z < 4; z++) - for (y = 0; y < 4; y++) - inv_lift(p + 4 * y + 16 * z); + // transform along z + for (uint y = 0; y < 4; y++) + for (uint x = 0; x < 4; x++) + inv_lift(p + 1 * x + 4 * y); + // transform along y + for (uint x = 0; x < 4; x++) + for (uint z = 0; z < 4; z++) + inv_lift(p + 16 * z + 1 * x); + // transform along x + for (uint z = 0; z < 4; z++) + for (uint y = 0; y < 4; y++) + inv_lift(p + 4 * y + 16 * z); } - }; template<> @@ -172,17 +208,11 @@ struct inv_transform<16> template __device__ void inv_xform(Int *p) { - - for(int x = 0; x < 4; ++x) - { - inv_lift(p + 1 * x); - } - for(int y = 0; y < 4; ++y) - { - inv_lift(p + 4 * y); - } + for (uint x = 0; x < 4; ++x) + inv_lift(p + 1 * x); + for (uint y = 0; y < 4; ++y) + inv_lift(p + 4 * y); } - }; template<> @@ -191,9 +221,8 @@ struct inv_transform<4> template __device__ void inv_xform(Int *p) { - inv_lift(p); + inv_lift(p); } - }; template @@ -227,39 +256,34 @@ __device__ void zfp_decode(BlockReader &reader, Scalar *fblock, uint ebits = 0; } - maxbits -= ebits; - + maxbits -= ebits; + UInt ublock[BlockSize]; decode_ints(reader, maxbits, ublock); Int iblock[BlockSize]; - unsigned char *perm = get_perm(); + const unsigned char *perm = get_perm(); #if (CUDART_VERSION < 8000) #pragma unroll #else #pragma unroll BlockSize #endif - for(int i = 0; i < BlockSize; ++i) - { - iblock[perm[i]] = uint2int(ublock[i]); - } + for (int i = 0; i < BlockSize; ++i) + iblock[perm[i]] = uint2int(ublock[i]); inv_transform trans; trans.inv_xform(iblock); - Scalar inv_w = dequantize(1, emax); + Scalar inv_w = dequantize(1, emax); #if (CUDART_VERSION < 8000) 
#pragma unroll #else #pragma unroll BlockSize #endif - for(int i = 0; i < BlockSize; ++i) - { - fblock[i] = inv_w * (Scalar)iblock[i]; - } - + for (int i = 0; i < BlockSize; ++i) + fblock[i] = inv_w * (Scalar)iblock[i]; } } diff --git a/src/cuda_zfp/decode1.cuh b/src/cuda_zfp/decode1.cuh index 996d9ed1..6d357f63 100644 --- a/src/cuda_zfp/decode1.cuh +++ b/src/cuda_zfp/decode1.cuh @@ -13,8 +13,8 @@ __device__ __host__ inline void scatter_partial1(const Scalar* q, Scalar* p, int nx, int sx) { uint x; - for (x = 0; x < nx; x++, p += sx) - *p = *q++; + for (x = 0; x < 4; x++) + if (x < nx) p[x * sx] = q[x]; } template @@ -127,9 +127,9 @@ size_t decode1launch(uint dim, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float rate = (float(dim) * sizeof(Scalar) ) / seconds; rate /= 1024.f; rate /= 1024.f; diff --git a/src/cuda_zfp/decode2.cuh b/src/cuda_zfp/decode2.cuh index 41e112b5..fa60a82f 100644 --- a/src/cuda_zfp/decode2.cuh +++ b/src/cuda_zfp/decode2.cuh @@ -12,9 +12,15 @@ __device__ __host__ inline void scatter_partial2(const Scalar* q, Scalar* p, int nx, int ny, int sx, int sy) { uint x, y; - for (y = 0; y < ny; y++, p += sy - nx * sx, q += 4 - nx) - for (x = 0; x < nx; x++, p += sx, q++) - *p = *q; + for (y = 0; y < 4; y++) + if (y < ny) { + for (x = 0; x < 4; x++) + if (x < nx) { + *p = q[4 * y + x]; + p += sx; + } + p += sy - nx * sx; + } } template @@ -144,9 +150,9 @@ size_t decode2launch(uint2 dims, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float rate = (float(dims.x * dims.y) * sizeof(Scalar) ) / seconds; rate /= 1024.f; rate /= 1024.f; diff --git a/src/cuda_zfp/decode3.cuh b/src/cuda_zfp/decode3.cuh index 2a3ef008..9f2a98a8 100644 --- a/src/cuda_zfp/decode3.cuh +++ b/src/cuda_zfp/decode3.cuh @@ -12,10 +12,19 @@ __device__ __host__ inline void scatter_partial3(const Scalar* q, Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz) { uint x, y, z; - for (z = 0; z < nz; z++, p += sz - ny * sy, q += 4 * (4 - ny)) - for (y = 0; y < ny; y++, p += sy - nx * sx, q += 4 - nx) - for (x = 0; x < nx; x++, p += sx, q++) - *p = *q; + for (z = 0; z < 4; z++) + if (z < nz) { + for (y = 0; y < 4; y++) + if (y < ny) { + for (x = 0; x < 4; x++) + if (x < nx) { + *p = q[16 * z + 4 * y + x]; + p += sx; + } + p += sy - nx * sx; + } + p += sz - ny * sy; + } } template @@ -154,9 +163,9 @@ size_t decode3launch(uint3 dims, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds; rate /= 1024.f; rate /= 1024.f; diff --git a/src/cuda_zfp/encode.cuh b/src/cuda_zfp/encode.cuh index c65bd356..995c9c32 100644 --- a/src/cuda_zfp/encode.cuh +++ b/src/cuda_zfp/encode.cuh @@ -11,7 +11,7 @@ __device__ static int precision(int maxexp, int maxprec, int minexp) { - return MIN(maxprec, MAX(0, maxexp - minexp + 8)); + return min(maxprec, 
max(0, maxexp - minexp + 8)); } template @@ -42,13 +42,19 @@ __device__ static int exponent(Scalar x) { + int e = -get_ebias(); +#ifdef ZFP_WITH_DAZ + // treat subnormals as zero; resolves issue #119 by avoiding overflow + if (x >= get_scalar_min()) + frexp(x, &e); +#else if (x > 0) { - int e; frexp(x, &e); - // clamp exponent in case x is denormalized - return max(e, 1 - get_ebias()); + // clamp exponent in case x is subnormal; may still result in overflow + e = max(e, 1 - get_ebias()); } - return -get_ebias(); +#endif + return e; } template @@ -57,10 +63,9 @@ static int max_exponent(const Scalar* p) { Scalar max_val = 0; - for(int i = 0; i < BlockSize; ++i) - { + for (int i = 0; i < BlockSize; ++i) { Scalar f = fabs(p[i]); - max_val = max(max_val,f); + max_val = max(max_val, f); } return exponent(max_val); } @@ -93,6 +98,25 @@ fwd_lift(Int* p) p -= s; *p = x; } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_FIRST +// bias values such that truncation is equivalent to round to nearest +template +__device__ +static void +fwd_round(Int* iblock, uint maxprec) +{ + // add or subtract 1/6 ulp to unbias errors + if (maxprec < (uint)(CHAR_BIT * sizeof(Int))) { + Int bias = (static_cast::UInt>(NBMASK) >> 2) >> maxprec; + uint n = BlockSize; + if (maxprec & 1u) + do *iblock++ += bias; while (--n); + else + do *iblock++ -= bias; while (--n); + } +} +#endif + template Scalar inline __device__ @@ -103,7 +127,7 @@ float inline __device__ quantize_factor(const int &exponent, float) { - return LDEXP(1.0, get_precision() - 2 - exponent); + return LDEXP(1.0, get_precision() - 2 - exponent); } template<> @@ -111,13 +135,13 @@ double inline __device__ quantize_factor(const int &exponent, double) { - return LDEXP(1.0, get_precision() - 2 - exponent); + return LDEXP(1.0, get_precision() - 2 - exponent); } template void __device__ fwd_cast(Int *iblock, const Scalar *fblock, int emax) { - Scalar s = quantize_factor(emax, Scalar()); + Scalar s = quantize_factor(emax, Scalar()); for(int i = 0; i < BlockSize; ++i) { iblock[i] = (Int) (s * fblock[i]); @@ -133,7 +157,6 @@ struct transform<64> template __device__ void fwd_xform(Int *p) { - uint x, y, z; /* transform along x */ for (z = 0; z < 4; z++) @@ -149,7 +172,6 @@ struct transform<64> fwd_lift(p + 1 * x + 4 * y); } - }; template<> @@ -158,16 +180,14 @@ struct transform<16> template __device__ void fwd_xform(Int *p) { - uint x, y; /* transform along x */ for (y = 0; y < 4; y++) - fwd_lift(p + 4 * y); + fwd_lift(p + 4 * y); /* transform along y */ for (x = 0; x < 4; x++) fwd_lift(p + 1 * x); - } - + } }; template<> @@ -178,14 +198,14 @@ struct transform<4> { fwd_lift(p); } - }; template __device__ void fwd_order(UInt *ublock, const Int *iblock) { - unsigned char *perm = get_perm(); - for(int i = 0; i < BlockSize; ++i) + const unsigned char *perm = get_perm(); + + for (int i = 0; i < BlockSize; ++i) { ublock[i] = int2uint(iblock[perm[i]]); } @@ -206,8 +226,8 @@ struct BlockWriter m_maxbits(maxbits), m_stream(stream) { - m_word_index = (block_idx * maxbits) / (sizeof(Word) * 8); - m_start_bit = uint((block_idx * maxbits) % (sizeof(Word) * 8)); + m_word_index = ((size_t)block_idx * maxbits) / (sizeof(Word) * 8); + m_start_bit = uint(((size_t)block_idx * maxbits) % (sizeof(Word) * 8)); } template @@ -289,41 +309,39 @@ void inline __device__ encode_block(BlockWriter &stream, int maxprec, Int *iblock) { + // perform decorrelating transform transform tform; tform.fwd_xform(iblock); +#if ZFP_ROUNDING_MODE == ZFP_ROUND_FIRST + // bias values to achieve proper rounding + 
fwd_round(iblock, maxprec); +#endif + + // reorder signed coefficients and convert to unsigned integer typedef typename zfp_traits::UInt UInt; UInt ublock[BlockSize]; fwd_order(ublock, iblock); - uint intprec = CHAR_BIT * (uint)sizeof(UInt); + // encode integer coefficients + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); uint kmin = intprec > maxprec ? intprec - maxprec : 0; uint bits = maxbits; - uint i, k, m, n; - uint64 x; - for (k = intprec, n = 0; bits && k-- > kmin;) { - /* step 1: extract bit plane #k to x */ - x = 0; - for (i = 0; i < BlockSize; i++) - { + for (uint k = intprec, n = 0; bits && k-- > kmin;) { + // step 1: extract bit plane #k to x + uint64 x = 0; + for (uint i = 0; i < BlockSize; i++) x += (uint64)((ublock[i] >> k) & 1u) << i; - } - /* step 2: encode first n bits of bit plane */ - m = min(n, bits); - //uint temp = bits; + // step 2: encode first n bits of bit plane + uint m = min(n, bits); bits -= m; x = stream.write_bits(x, m); - - /* step 3: unary run-length encode remainder of bit plane */ + // step 3: unary run-length encode remainder of bit plane for (; n < BlockSize && bits && (bits--, stream.write_bit(!!x)); x >>= 1, n++) - { for (; n < BlockSize - 1 && bits && (bits--, !stream.write_bit(x & 1u)); x >>= 1, n++) - { - } - } + ; } - } template diff --git a/src/cuda_zfp/encode1.cuh b/src/cuda_zfp/encode1.cuh index 9353f8c0..98ce5a75 100644 --- a/src/cuda_zfp/encode1.cuh +++ b/src/cuda_zfp/encode1.cuh @@ -17,8 +17,8 @@ __device__ __host__ inline void gather_partial1(Scalar* q, const Scalar* p, int nx, int sx) { uint x; - for (x = 0; x < nx; x++, p += sx) - q[x] = *p; + for (x = 0; x < 4; x++) + if (x < nx) q[x] = p[x * sx]; pad_block(q, nx, 1); } @@ -131,7 +131,7 @@ size_t encode1launch(uint dim, cudaEventRecord(start); #endif - cudaEncode1 << > > + cudaEncode1 <<>> (maxbits, d_data, stream, @@ -145,9 +145,9 @@ size_t encode1launch(uint dim, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0.f; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0.f; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float gb = (float(dim) * float(sizeof(Scalar))) / (1024.f * 1024.f * 1024.f); float rate = gb / seconds; printf("Encode elapsed time: %.5f (s)\n", seconds); diff --git a/src/cuda_zfp/encode2.cuh b/src/cuda_zfp/encode2.cuh index 7d9ebfe0..0d577d51 100644 --- a/src/cuda_zfp/encode2.cuh +++ b/src/cuda_zfp/encode2.cuh @@ -17,11 +17,16 @@ __device__ __host__ inline void gather_partial2(Scalar* q, const Scalar* p, int nx, int ny, int sx, int sy) { uint x, y; - for (y = 0; y < ny; y++, p += sy - nx * sx) { - for (x = 0; x < nx; x++, p += sx) - q[4 * y + x] = *p; + for (y = 0; y < 4; y++) + if (y < ny) { + for (x = 0; x < 4; x++) + if (x < nx) { + q[4 * y + x] = *p;//[x * sx]; + p += sx; + } pad_block(q + 4 * y, nx, 1); - } + p += sy - nx * sx; + } for (x = 0; x < 4; x++) pad_block(q + x, ny, 4); } @@ -143,7 +148,7 @@ size_t encode2launch(uint2 dims, cudaEventRecord(start); #endif - cudaEncode2 << > > + cudaEncode2 <<>> (maxbits, d_data, stream, @@ -158,9 +163,9 @@ size_t encode2launch(uint2 dims, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0.f; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0.f; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float mb = (float(dims.x * dims.y) * sizeof(Scalar)) / 
(1024.f * 1024.f *1024.f); float rate = mb / seconds; printf("Encode elapsed time: %.5f (s)\n", seconds); diff --git a/src/cuda_zfp/encode3.cuh b/src/cuda_zfp/encode3.cuh index 9fe7ddd2..1edee9e9 100644 --- a/src/cuda_zfp/encode3.cuh +++ b/src/cuda_zfp/encode3.cuh @@ -14,15 +14,22 @@ __device__ __host__ inline void gather_partial3(Scalar* q, const Scalar* p, int nx, int ny, int nz, int sx, int sy, int sz) { uint x, y, z; - for (z = 0; z < nz; z++, p += sz - ny * sy) { - for (y = 0; y < ny; y++, p += sy - nx * sx) { - for (x = 0; x < nx; x++, p += sx) - q[16 * z + 4 * y + x] = *p; - pad_block(q + 16 * z + 4 * y, nx, 1); + for (z = 0; z < 4; z++) + if (z < nz) { + for (y = 0; y < 4; y++) + if (y < ny) { + for (x = 0; x < 4; x++) + if (x < nx) { + q[16 * z + 4 * y + x] = *p; + p += sx; + } + p += sy - nx * sx; + pad_block(q + 16 * z + 4 * y, nx, 1); + } + for (x = 0; x < 4; x++) + pad_block(q + 16 * z + x, ny, 4); + p += sz - ny * sy; } - for (x = 0; x < 4; x++) - pad_block(q + 16 * z + x, ny, 4); - } for (y = 0; y < 4; y++) for (x = 0; x < 4; x++) pad_block(q + 4 * y + x, nz, 16); @@ -150,7 +157,7 @@ size_t encode3launch(uint3 dims, cudaEventRecord(start); #endif - cudaEncode << > > + cudaEncode <<>> (maxbits, d_data, stream, @@ -164,9 +171,9 @@ size_t encode3launch(uint3 dims, cudaEventSynchronize(stop); cudaStreamSynchronize(0); - float miliseconds = 0; - cudaEventElapsedTime(&miliseconds, start, stop); - float seconds = miliseconds / 1000.f; + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + float seconds = milliseconds / 1000.f; float rate = (float(dims.x * dims.y * dims.z) * sizeof(Scalar) ) / seconds; rate /= 1024.f; rate /= 1024.f; diff --git a/src/cuda_zfp/shared.h b/src/cuda_zfp/shared.h index 52de03ad..27df25be 100644 --- a/src/cuda_zfp/shared.h +++ b/src/cuda_zfp/shared.h @@ -7,20 +7,17 @@ typedef unsigned long long Word; #include "type_info.cuh" #include "zfp.h" +#include "constants.h" #include #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) < (y) ? 
(x) : (y)) -#define bitsize(x) (CHAR_BIT * (uint)sizeof(x)) +#define bitsize(x) ((uint)(CHAR_BIT * sizeof(x))) #define LDEXP(x, e) ldexp(x, e) #define NBMASK 0xaaaaaaaaaaaaaaaaull -__constant__ unsigned char c_perm_1[4]; -__constant__ unsigned char c_perm_2[16]; -__constant__ unsigned char c_perm[64]; - namespace cuZFP { @@ -87,9 +84,17 @@ size_t calc_device_mem3d(const uint3 encoded_dims, dim3 get_max_grid_dims() { - cudaDeviceProp prop; - int device = 0; - cudaGetDeviceProperties(&prop, device); + static cudaDeviceProp prop; + static bool firstTime = true; + + if( firstTime ) + { + firstTime = false; + + int device = 0; + cudaGetDeviceProperties(&prop, device); + } + dim3 grid_dims; grid_dims.x = prop.maxGridSize[0]; grid_dims.y = prop.maxGridSize[1]; @@ -126,7 +131,7 @@ dim3 calculate_grid_size(size_t size, size_t cuda_block_size) if(dims == 2) { float sq_r = sqrt((float)grids); - float intpart = 0.; + float intpart = 0; modf(sq_r,&intpart); uint base = intpart; grid_size.x = base; @@ -141,7 +146,7 @@ dim3 calculate_grid_size(size_t size, size_t cuda_block_size) if(dims == 3) { float cub_r = pow((float)grids, 1.f/3.f);; - float intpart = 0.; + float intpart = 0; modf(cub_r,&intpart); int base = intpart; grid_size.x = base; @@ -185,7 +190,7 @@ __device__ double dequantize(const long long int &x, const int &e) { - return LDEXP((double)x, e - (CHAR_BIT * scalar_sizeof() - 2)); + return LDEXP((double)x, e - ((int)(CHAR_BIT * scalar_sizeof()) - 2)); } template<> @@ -193,7 +198,7 @@ __device__ float dequantize(const int &x, const int &e) { - return LDEXP((float)x, e - (CHAR_BIT * scalar_sizeof() - 2)); + return LDEXP((float)x, e - ((int)(CHAR_BIT * scalar_sizeof()) - 2)); } template<> @@ -245,28 +250,28 @@ inv_lift(Int* p) template -__device__ -unsigned char* get_perm(); +__device__ inline +const unsigned char* get_perm(); template<> -__device__ -unsigned char* get_perm<64>() +__device__ inline +const unsigned char* get_perm<64>() { - return c_perm; + return perm_3d; } template<> -__device__ -unsigned char* get_perm<16>() +__device__ inline +const unsigned char* get_perm<16>() { - return c_perm_2; + return perm_2; } template<> -__device__ -unsigned char* get_perm<4>() +__device__ inline +const unsigned char* get_perm<4>() { - return c_perm_1; + return perm_1; } diff --git a/src/cuda_zfp/type_info.cuh b/src/cuda_zfp/type_info.cuh index 969f5532..25d76922 100644 --- a/src/cuda_zfp/type_info.cuh +++ b/src/cuda_zfp/type_info.cuh @@ -1,6 +1,8 @@ #ifndef cuZFP_TYPE_INFO #define cuZFP_TYPE_INFO +#include + namespace cuZFP { template inline __host__ __device__ int get_ebias(); @@ -27,13 +29,22 @@ template<> inline __host__ __device__ int get_min_exp() { return -1074; } template<> inline __host__ __device__ int get_min_exp() { return 0; } template<> inline __host__ __device__ int get_min_exp() { return 0; } -template inline __host__ __device__ int scalar_sizeof(); +template inline __host__ __device__ T get_scalar_min(); +template<> inline __host__ __device__ float get_scalar_min() { return FLT_MIN; } +template<> inline __host__ __device__ double get_scalar_min() { return DBL_MIN; } +template<> inline __host__ __device__ long long int get_scalar_min() { return 0; } +template<> inline __host__ __device__ int get_scalar_min() { return 0; } +template inline __host__ __device__ int scalar_sizeof(); template<> inline __host__ __device__ int scalar_sizeof() { return 8; } template<> inline __host__ __device__ int scalar_sizeof() { return 8; } template<> inline __host__ __device__ int scalar_sizeof() { return 
4; } template<> inline __host__ __device__ int scalar_sizeof() { return 4; } +template inline __host__ __device__ T get_nbmask(); +template<> inline __host__ __device__ unsigned int get_nbmask() { return 0xaaaaaaaau; } +template<> inline __host__ __device__ unsigned long long int get_nbmask() { return 0xaaaaaaaaaaaaaaaaull; } + template struct zfp_traits; template<> struct zfp_traits @@ -75,6 +86,7 @@ template<> inline __host__ __device__ bool is_int() return true; } +#if 0 template struct block_traits; template<> struct block_traits<1> @@ -86,7 +98,8 @@ template<> struct block_traits<2> { typedef unsigned short PlaneType; }; - +#endif } // namespace cuZFP + #endif diff --git a/src/decode1d.c b/src/decode1d.c index 436515a9..b95995fa 100644 --- a/src/decode1d.c +++ b/src/decode1d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec1.c" #include "template/decode.c" diff --git a/src/decode1f.c b/src/decode1f.c index 443b8522..f08119f7 100644 --- a/src/decode1f.c +++ b/src/decode1f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec1.c" #include "template/decode.c" diff --git a/src/decode1i.c b/src/decode1i.c index 73f58e6c..b148641e 100644 --- a/src/decode1i.c +++ b/src/decode1i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec1.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode1l.c b/src/decode1l.c index cedcc532..d79e8e46 100644 --- a/src/decode1l.c +++ b/src/decode1l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec1.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode2d.c b/src/decode2d.c index 8c3a994d..d7f3a77c 100644 --- a/src/decode2d.c +++ b/src/decode2d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec2.c" #include "template/decode.c" diff --git a/src/decode2f.c b/src/decode2f.c index 7b3c35cf..5d44e072 100644 
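The 0xaaaa... constants added as get_nbmask() in type_info.cuh above are the negabinary masks; uint2int() in decode.cuh applies the same formula when mapping decoded coefficients back to two's complement. A host-side sketch of the pair follows; the forward int2uint() does not appear in this patch, so the definition below follows zfp's usual convention and should be read as an assumption::

  #include <assert.h>
  #include <stdint.h>

  #define NBMASK 0xaaaaaaaaaaaaaaaaull

  /* two's complement -> negabinary, as used when encoding coefficients (assumed form) */
  static uint64_t int2uint(int64_t x) { return ((uint64_t)x + NBMASK) ^ NBMASK; }

  /* negabinary -> two's complement, matching uint2int() in decode.cuh */
  static int64_t uint2int(uint64_t x) { return (int64_t)((x ^ NBMASK) - NBMASK); }

  int main(void)
  {
    /* the mapping is a bijection, so decoding undoes encoding exactly */
    for (int64_t v = -8; v <= 8; v++)
      assert(uint2int(int2uint(v)) == v);
    return 0;
  }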
--- a/src/decode2f.c +++ b/src/decode2f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec2.c" #include "template/decode.c" diff --git a/src/decode2i.c b/src/decode2i.c index 70a4a5a2..579eaa82 100644 --- a/src/decode2i.c +++ b/src/decode2i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec2.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode2l.c b/src/decode2l.c index 93a2cf83..b4d871f5 100644 --- a/src/decode2l.c +++ b/src/decode2l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec2.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode3d.c b/src/decode3d.c index b8cb9d18..e9291aa4 100644 --- a/src/decode3d.c +++ b/src/decode3d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec3.c" #include "template/decode.c" diff --git a/src/decode3f.c b/src/decode3f.c index 914c4999..cc517b13 100644 --- a/src/decode3f.c +++ b/src/decode3f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec3.c" #include "template/decode.c" diff --git a/src/decode3i.c b/src/decode3i.c index 46af93e0..0eb05dea 100644 --- a/src/decode3i.c +++ b/src/decode3i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec3.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode3l.c b/src/decode3l.c index 1e76d171..d895d0e7 100644 --- a/src/decode3l.c +++ b/src/decode3l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include 
"zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec3.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode4d.c b/src/decode4d.c index ee5b31fc..38861b5d 100644 --- a/src/decode4d.c +++ b/src/decode4d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec4.c" #include "template/decode.c" diff --git a/src/decode4f.c b/src/decode4f.c index 5eb3b900..7ef87f10 100644 --- a/src/decode4f.c +++ b/src/decode4f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec4.c" #include "template/decode.c" diff --git a/src/decode4i.c b/src/decode4i.c index b871eba4..ade99493 100644 --- a/src/decode4i.c +++ b/src/decode4i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec4.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/decode4l.c b/src/decode4l.c index b37e47e1..bbbdefbb 100644 --- a/src/decode4l.c +++ b/src/decode4l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec4.c" #include "template/decode.c" #include "template/decodei.c" diff --git a/src/encode1d.c b/src/encode1d.c index 84b9ac8c..43f5101c 100644 --- a/src/encode1d.c +++ b/src/encode1d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec1.c" #include "template/encode.c" diff --git a/src/encode1f.c b/src/encode1f.c index a57a7cf7..ae509d53 100644 --- a/src/encode1f.c +++ b/src/encode1f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" 
+#include "template/codec.c" #include "template/codecf.c" #include "template/codec1.c" #include "template/encode.c" diff --git a/src/encode1i.c b/src/encode1i.c index dcd9aa64..ea3593cd 100644 --- a/src/encode1i.c +++ b/src/encode1i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec1.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode1l.c b/src/encode1l.c index 032c3de6..e9415e20 100644 --- a/src/encode1l.c +++ b/src/encode1l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block1.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec1.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode2d.c b/src/encode2d.c index 50e8dd83..8f445892 100644 --- a/src/encode2d.c +++ b/src/encode2d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec2.c" #include "template/encode.c" diff --git a/src/encode2f.c b/src/encode2f.c index 713a74e4..814a18a2 100644 --- a/src/encode2f.c +++ b/src/encode2f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec2.c" #include "template/encode.c" diff --git a/src/encode2i.c b/src/encode2i.c index d0b4b54c..8417031f 100644 --- a/src/encode2i.c +++ b/src/encode2i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec2.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode2l.c b/src/encode2l.c index d834cfa1..87f5a2f3 100644 --- a/src/encode2l.c +++ b/src/encode2l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block2.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec2.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode3d.c b/src/encode3d.c index 16c385e2..55f55d62 
100644 --- a/src/encode3d.c +++ b/src/encode3d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec3.c" #include "template/encode.c" diff --git a/src/encode3f.c b/src/encode3f.c index 1668aff8..de3bbaf4 100644 --- a/src/encode3f.c +++ b/src/encode3f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec3.c" #include "template/encode.c" diff --git a/src/encode3i.c b/src/encode3i.c index c92a1a69..257a1ecd 100644 --- a/src/encode3i.c +++ b/src/encode3i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec3.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode3l.c b/src/encode3l.c index 4d53304e..c6269699 100644 --- a/src/encode3l.c +++ b/src/encode3l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block3.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec3.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode4d.c b/src/encode4d.c index c82d19a1..346f1747 100644 --- a/src/encode4d.c +++ b/src/encode4d.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsd.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec4.c" #include "template/encode.c" diff --git a/src/encode4f.c b/src/encode4f.c index e0ce0146..b855262b 100644 --- a/src/encode4f.c +++ b/src/encode4f.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsf.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codecf.c" #include "template/codec4.c" #include "template/encode.c" diff --git a/src/encode4i.c b/src/encode4i.c index ab82e0e2..5bed6cdf 100644 --- a/src/encode4i.c +++ b/src/encode4i.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include 
"zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsi.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec4.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/encode4l.c b/src/encode4l.c index 805ee01a..fd84e5a1 100644 --- a/src/encode4l.c +++ b/src/encode4l.c @@ -1,11 +1,12 @@ -#include "inline/inline.h" +#include "zfp/internal/zfp/inline.h" #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" #include "block4.h" #include "traitsl.h" #include "template/template.h" #include "template/codec.h" -#include "inline/bitstream.c" +#include "zfp/bitstream.inl" +#include "template/codec.c" #include "template/codec4.c" #include "template/encode.c" #include "template/encodei.c" diff --git a/src/share/omp.c b/src/share/omp.c index 9ee26b9a..02507e56 100644 --- a/src/share/omp.c +++ b/src/share/omp.c @@ -1,11 +1,13 @@ #ifdef _OPENMP +#include #include +#include "zfp.h" /* number of omp threads to use */ -static int +static uint thread_count_omp(const zfp_stream* stream) { - int count = stream->exec.params.omp.threads; + uint count = zfp_stream_omp_threads(stream); /* if no thread count is specified, use default number of threads */ if (!count) count = omp_get_max_threads(); @@ -13,13 +15,17 @@ thread_count_omp(const zfp_stream* stream) } /* number of chunks to partition array into */ -static uint -chunk_count_omp(const zfp_stream* stream, uint blocks, uint threads) +static size_t +chunk_count_omp(const zfp_stream* stream, size_t blocks, uint threads) { - uint chunk_size = stream->exec.params.omp.chunk_size; + size_t chunk_size = (size_t)zfp_stream_omp_chunk_size(stream); /* if no chunk size is specified, assign one chunk per thread */ - uint chunks = chunk_size ? (blocks + chunk_size - 1) / chunk_size : threads; - return MIN(chunks, blocks); + size_t chunks = chunk_size ? 
(blocks + chunk_size - 1) / chunk_size : threads; + /* each chunk must contain at least one block */ + chunks = MIN(chunks, blocks); + /* OpenMP 2.0 loop counters must be ints */ + chunks = MIN(chunks, INT_MAX); + return chunks; } #endif diff --git a/src/share/parallel.c b/src/share/parallel.c index e778ac7c..1ae36526 100644 --- a/src/share/parallel.c +++ b/src/share/parallel.c @@ -1,41 +1,42 @@ #ifdef _OPENMP /* block index at which chunk begins */ -static uint -chunk_offset(uint blocks, uint chunks, uint chunk) +static size_t +chunk_offset(size_t blocks, size_t chunks, size_t chunk) { - return (uint)((blocks * (uint64)chunk) / chunks); + return (size_t)(((uint64)blocks * (uint64)chunk) / chunks); } /* initialize per-thread bit streams for parallel compression */ static bitstream** -compress_init_par(zfp_stream* stream, const zfp_field* field, uint chunks, uint blocks) +compress_init_par(zfp_stream* stream, const zfp_field* field, size_t chunks, size_t blocks) { bitstream** bs; + zfp_bool copy; + size_t n = 4 * (blocks + chunks - 1) / chunks; size_t size; - int copy = 0; - uint i; + size_t chunk; /* determine maximum size buffer needed per thread */ zfp_field f = *field; switch (zfp_field_dimensionality(field)) { case 1: - f.nx = 4 * (blocks + chunks - 1) / chunks; + f.nx = n; break; case 2: f.nx = 4; - f.ny = 4 * (blocks + chunks - 1) / chunks; + f.ny = n; break; case 3: f.nx = 4; f.ny = 4; - f.nz = 4 * (blocks + chunks - 1) / chunks; + f.nz = n; break; case 4: f.nx = 4; f.ny = 4; f.nz = 4; - f.nw = 4 * (blocks + chunks - 1) / chunks; + f.nw = n; break; default: return NULL; @@ -43,27 +44,27 @@ compress_init_par(zfp_stream* stream, const zfp_field* field, uint chunks, uint size = zfp_stream_maximum_size(stream, &f); /* avoid copies in fixed-rate mode when each bitstream is word aligned */ - copy |= stream->minbits != stream->maxbits; - copy |= (stream->maxbits % stream_word_bits) != 0; - copy |= (stream_wtell(stream->stream) % stream_word_bits) != 0; + copy = (stream->minbits != stream->maxbits) || + (stream->maxbits % stream_word_bits != 0) || + (stream_wtell(stream->stream) % stream_word_bits != 0); /* set up buffer for each thread to compress to */ bs = (bitstream**)malloc(chunks * sizeof(bitstream*)); if (!bs) return NULL; - for (i = 0; i < chunks; i++) { - uint block = chunk_offset(blocks, chunks, i); - void* buffer = copy ? malloc(size) : (uchar*)stream_data(stream->stream) + stream_size(stream->stream) + block * stream->maxbits / CHAR_BIT; + for (chunk = 0; chunk < chunks; chunk++) { + size_t block = chunk_offset(blocks, chunks, chunk); + void* buffer = copy ? 
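chunk_count_omp() above derives the chunk count by ceiling division of blocks by the requested chunk size, falls back to one chunk per thread, and clamps the result so each chunk owns at least one block and the count fits an int loop counter; chunk_offset() then splits the blocks as evenly as possible. A standalone sketch of the same arithmetic, using plain parameters instead of the zfp_stream accessors (function and variable names here are illustrative):

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* number of chunks: ceil(blocks / chunk_size), or one chunk per thread */
static size_t
chunk_count(size_t blocks, size_t chunk_size, unsigned threads)
{
  size_t chunks = chunk_size ? (blocks + chunk_size - 1) / chunk_size : threads;
  chunks = MIN(chunks, blocks);   /* each chunk must hold at least one block */
  chunks = MIN(chunks, INT_MAX);  /* OpenMP 2.0 loop counters must be ints */
  return chunks;
}

/* first block index of a chunk: blocks are split as evenly as possible */
static size_t
chunk_offset(size_t blocks, size_t chunks, size_t chunk)
{
  return (size_t)(((unsigned long long)blocks * chunk) / chunks);
}

int main(void)
{
  size_t blocks = 1000, chunks = chunk_count(blocks, 0, 8), c;
  for (c = 0; c < chunks; c++)
    printf("chunk %zu: blocks [%zu, %zu)\n", c,
           chunk_offset(blocks, chunks, c),
           chunk_offset(blocks, chunks, c + 1));
  return 0;
}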
malloc(size) : (uchar*)stream_data(stream->stream) + stream_size(stream->stream) + block * (stream->maxbits / CHAR_BIT); if (!buffer) break; - bs[i] = stream_open(buffer, size); + bs[chunk] = stream_open(buffer, size); } /* handle memory allocation failure */ - if (copy && i < chunks) { - while (i--) { - free(stream_data(bs[i])); - stream_close(bs[i]); + if (copy && chunk < chunks) { + while (chunk--) { + free(stream_data(bs[chunk])); + stream_close(bs[chunk]); } free(bs); bs = NULL; @@ -74,24 +75,27 @@ compress_init_par(zfp_stream* stream, const zfp_field* field, uint chunks, uint /* flush and concatenate bit streams if needed */ static void -compress_finish_par(zfp_stream* stream, bitstream** src, uint chunks) +compress_finish_par(zfp_stream* stream, bitstream** src, size_t chunks) { bitstream* dst = zfp_stream_bit_stream(stream); - int copy = (stream_data(dst) != stream_data(*src)); - size_t offset = stream_wtell(dst); - uint i; - for (i = 0; i < chunks; i++) { - size_t bits = stream_wtell(src[i]); + zfp_bool copy = (stream_data(dst) != stream_data(*src)); + bitstream_offset offset = stream_wtell(dst); + size_t chunk; + + /* flush each stream and concatenate if necessary */ + for (chunk = 0; chunk < chunks; chunk++) { + bitstream_size bits = stream_wtell(src[chunk]); offset += bits; - stream_flush(src[i]); + stream_flush(src[chunk]); /* concatenate streams if they are not already contiguous */ if (copy) { - stream_rewind(src[i]); - stream_copy(dst, src[i], bits); - free(stream_data(src[i])); + stream_rewind(src[chunk]); + stream_copy(dst, src[chunk], bits); + free(stream_data(src[chunk])); } - stream_close(src[i]); + stream_close(src[chunk]); } + free(src); if (!copy) stream_wseek(dst, offset); diff --git a/src/template/codec.c b/src/template/codec.c new file mode 100644 index 00000000..539bca98 --- /dev/null +++ b/src/template/codec.c @@ -0,0 +1,6 @@ +/* true if max compressed size exceeds maxbits */ +static int +with_maxbits(uint maxbits, uint maxprec, uint size) +{ + return (maxprec + 1) * size - 1 > maxbits; +} diff --git a/src/template/codecf.c b/src/template/codecf.c index bc2cc808..50929fa7 100644 --- a/src/template/codecf.c +++ b/src/template/codecf.c @@ -5,14 +5,18 @@ static uint precision(int maxexp, uint maxprec, int minexp, int dims) { - return MIN(maxprec, (uint)MAX(0, maxexp - minexp + 2 * (dims + 1))); +#if (ZFP_ROUNDING_MODE != ZFP_ROUND_NEVER) && defined(ZFP_WITH_TIGHT_ERROR) + return MIN(maxprec, (uint)MAX(0, maxexp - minexp + 2 * dims + 1)); +#else + return MIN(maxprec, (uint)MAX(0, maxexp - minexp + 2 * dims + 2)); +#endif } /* map integer x relative to exponent e to floating-point number */ static Scalar _t1(dequantize, Scalar)(Int x, int e) { - return LDEXP((Scalar)x, e - (CHAR_BIT * (int)sizeof(Scalar) - 2)); + return LDEXP((Scalar)x, e - ((int)(CHAR_BIT * sizeof(Scalar)) - 2)); } /* inverse block-floating-point transform from signed integers */ diff --git a/src/template/compress.c b/src/template/compress.c index 3bef658d..74983c56 100644 --- a/src/template/compress.c +++ b/src/template/compress.c @@ -3,9 +3,9 @@ static void _t2(compress, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - uint mx = nx & ~3u; - uint x; + size_t nx = field->nx; + size_t mx = nx & ~3u; + size_t x; /* compress array one block of 4 values at a time */ for (x = 0; x < mx; x += 4, data += 4) @@ -19,9 +19,9 @@ static void _t2(compress_strided, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { const 
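The new src/template/codec.c helper above is the switch between the two coding paths introduced in this change: when even the largest possible block encoding, roughly (maxprec + 1) * size - 1 bits for size values, exceeds the maxbits budget, the rate-constrained coders that track a bit budget are used; otherwise the precision-only coders emit whole bit planes. A small worked example of that predicate (the parameter values are illustrative only):

#include <stdio.h>

/* true if the maximum possible compressed size exceeds the maxbits budget */
static int
with_maxbits(unsigned maxbits, unsigned maxprec, unsigned size)
{
  return (maxprec + 1) * size - 1 > maxbits;
}

int main(void)
{
  /* 3D double-precision block: 64 values, up to 64 bit planes */
  printf("%d\n", with_maxbits(1024, 64, 64));  /* 1: 4159 > 1024, rate-constrained path */
  printf("%d\n", with_maxbits(16384, 64, 64)); /* 0: 4159 <= 16384, whole bit planes */
  return 0;
}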
Scalar* data = field->data; - uint nx = field->nx; - int sx = field->sx ? field->sx : 1; - uint x; + size_t nx = field->nx; + ptrdiff_t sx = field->sx ? field->sx : 1; + size_t x; /* compress array one block of 4 values at a time */ for (x = 0; x < nx; x += 4) { @@ -38,11 +38,11 @@ static void _t2(compress_strided, Scalar, 2)(zfp_stream* stream, const zfp_field* field) { const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - uint x, y; + size_t nx = field->nx; + size_t ny = field->ny; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + size_t x, y; /* compress array one block of 4x4 values at a time */ for (y = 0; y < ny; y += 4) @@ -60,13 +60,13 @@ static void _t2(compress_strided, Scalar, 3)(zfp_stream* stream, const zfp_field* field) { const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); - uint x, y, z; + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny); + size_t x, y, z; /* compress array one block of 4x4x4 values at a time */ for (z = 0; z < nz; z += 4) @@ -85,15 +85,15 @@ static void _t2(compress_strided, Scalar, 4)(zfp_stream* stream, const zfp_field* field) { const Scalar* data = field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - uint nw = field->nw; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); - int sw = field->sw ? field->sw : (int)(nx * ny * nz); - uint x, y, z, w; + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + size_t nw = field->nw; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny); + ptrdiff_t sw = field->sw ? 
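compress_strided() above substitutes default strides when the field leaves them zero, sx = 1, sy = nx, sz = nx * ny, so a contiguous array needs no explicit strides, and then walks the field one 4^d block at a time. A compact sketch of the 2D traversal this implies (visit_blocks_2d is a made-up name; the real code hands each block to zfp_encode_block_strided or its partial-block variant):

#include <stddef.h>
#include <stdio.h>

/* visit a ny-by-nx array in 4x4 blocks using zfp-style strides */
static void
visit_blocks_2d(const double* data, size_t nx, size_t ny,
                ptrdiff_t sx, ptrdiff_t sy)
{
  size_t x, y;
  if (!sx) sx = 1;                  /* default: contiguous in x */
  if (!sy) sy = (ptrdiff_t)nx;      /* default: rows of nx values */
  for (y = 0; y < ny; y += 4)
    for (x = 0; x < nx; x += 4) {
      const double* p = data + sx * (ptrdiff_t)x + sy * (ptrdiff_t)y;
      printf("block at (%zu, %zu) starts at element %td\n", x, y, p - data);
    }
}

int main(void)
{
  double field[8 * 8] = { 0 };
  visit_blocks_2d(field, 8, 8, 0, 0); /* zero strides select the defaults */
  return 0;
}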
field->sw : (ptrdiff_t)(nx * ny * nz); + size_t x, y, z, w; /* compress array one block of 4x4x4x4 values at a time */ for (w = 0; w < nw; w += 4) diff --git a/src/template/cudacompress.c b/src/template/cudacompress.c index 1d685c92..8249beb9 100644 --- a/src/template/cudacompress.c +++ b/src/template/cudacompress.c @@ -5,40 +5,32 @@ static void _t2(compress_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_compress(stream, field); - } } /* compress 1d strided array */ static void _t2(compress_strided_cuda, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_compress(stream, field); - } } /* compress 2d strided array */ static void _t2(compress_strided_cuda, Scalar, 2)(zfp_stream* stream, const zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_compress(stream, field); - } } /* compress 3d strided array */ static void _t2(compress_strided_cuda, Scalar, 3)(zfp_stream* stream, const zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_compress(stream, field); - } } #endif diff --git a/src/template/cudadecompress.c b/src/template/cudadecompress.c index 4ea4e5bf..1dc918a6 100644 --- a/src/template/cudadecompress.c +++ b/src/template/cudadecompress.c @@ -5,40 +5,32 @@ static void _t2(decompress_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_decompress(stream, field); - } } /* compress 1d strided array */ static void _t2(decompress_strided_cuda, Scalar, 1)(zfp_stream* stream, zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_decompress(stream, field); - } } /* compress 2d strided array */ static void _t2(decompress_strided_cuda, Scalar, 2)(zfp_stream* stream, zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_decompress(stream, field); - } } /* compress 3d strided array */ static void _t2(decompress_strided_cuda, Scalar, 3)(zfp_stream* stream, zfp_field* field) { - if(zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) - { + if (zfp_stream_compression_mode(stream) == zfp_mode_fixed_rate) cuda_decompress(stream, field); - } } #endif diff --git a/src/template/decode.c b/src/template/decode.c index e2a2f276..2f39bebd 100644 --- a/src/template/decode.c +++ b/src/template/decode.c @@ -6,7 +6,7 @@ static void _t2(inv_xform, Int, DIMS)(Int* p); /* inverse lifting transform of 4-vector */ static void -_t1(inv_lift, Int)(Int* p, uint s) +_t1(inv_lift, Int)(Int* p, ptrdiff_t s) { Int x, y, z, w; x = *p; p += s; @@ -33,6 +33,21 @@ _t1(inv_lift, Int)(Int* p, uint s) p -= s; *p = x; } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST +/* bias values such that truncation is equivalent to round to nearest */ +static void +_t1(inv_round, UInt)(UInt* ublock, uint n, uint m, uint prec) +{ + /* add 1/6 ulp to unbias errors */ + if (prec < 
(uint)(CHAR_BIT * sizeof(UInt) - 1)) { + /* the first m values (0 <= m <= n) have one more bit of precision */ + n -= m; + while (m--) *ublock++ += ((NBMASK >> 2) >> prec); + while (n--) *ublock++ += ((NBMASK >> 1) >> prec); + } +} +#endif + /* map two's complement signed integer to negabinary unsigned integer */ static Int _t1(uint2int, UInt)(UInt x) @@ -49,13 +64,13 @@ _t1(inv_order, Int)(const UInt* ublock, Int* iblock, const uchar* perm, uint n) while (--n); } -/* decompress sequence of size unsigned integers */ +/* decompress sequence of size <= 64 unsigned integers */ static uint -_t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size) +_t1(decode_few_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size) { /* make a copy of bit stream to avoid aliasing */ bitstream s = *stream; - uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); uint kmin = intprec > maxprec ? intprec - maxprec : 0; uint bits = maxbits; uint i, k, m, n; @@ -66,20 +81,40 @@ _t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, data[i] = 0; /* decode one bit plane at a time from MSB to LSB */ - for (k = intprec, n = 0; bits && k-- > kmin;) { - /* decode first n bits of bit plane #k */ + for (k = intprec, m = n = 0; bits && (m = 0, k-- > kmin);) { + /* step 1: decode first n bits of bit plane #k */ m = MIN(n, bits); bits -= m; x = stream_read_bits(&s, m); - /* unary run-length decode remainder of bit plane */ - for (; n < size && bits && (bits--, stream_read_bit(&s)); x += (uint64)1 << n++) - for (; n < size - 1 && bits && (bits--, !stream_read_bit(&s)); n++) - ; - /* deposit bit plane from x */ + /* step 2: unary run-length decode remainder of bit plane */ + for (; bits && n < size; n++, m = n) { + bits--; + if (stream_read_bit(&s)) { + /* positive group test; scan for next one-bit */ + for (; bits && n < size - 1; n++) { + bits--; + if (stream_read_bit(&s)) + break; + } + /* set bit and continue decoding bit plane */ + x += (uint64)1 << n; + } + else { + /* negative group test; done with bit plane */ + m = size; + break; + } + } + /* step 3: deposit bit plane from x */ for (i = 0; x; i++, x >>= 1) data[i] += (UInt)(x & 1u) << k; } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST + /* bias values to achieve proper rounding */ + _t1(inv_round, UInt)(data, size, m, intprec - k); +#endif + *stream = s; return maxbits - bits; } @@ -90,7 +125,7 @@ _t1(decode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxp { /* make a copy of bit stream to avoid aliasing */ bitstream s = *stream; - uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); uint kmin = intprec > maxprec ? 
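uint2int() above (and its encoder counterpart int2uint() later in this patch) converts between two's complement and a negabinary-style unsigned representation so that values of small magnitude, of either sign, have mostly zero high-order bit planes. A self-contained round-trip demonstration, assuming the conventional alternating-bit NBMASK constant (the exact definition used by zfp lives in its internal headers):

#include <stdint.h>
#include <stdio.h>

/* alternating-bit mask 0b...10101010 assumed for the negabinary conversion */
#define NBMASK 0xaaaaaaaaaaaaaaaaULL

/* two's complement -> negabinary: small magnitudes use few significant bits */
static uint64_t int2uint(int64_t x) { return ((uint64_t)x + NBMASK) ^ NBMASK; }

/* negabinary -> two's complement (modular inverse of the above; relies on the
   usual wraparound conversion to a signed type, as the zfp sources do) */
static int64_t uint2int(uint64_t x) { return (int64_t)((x ^ NBMASK) - NBMASK); }

int main(void)
{
  int64_t v;
  for (v = -3; v <= 3; v++)
    printf("%3lld -> %llu -> %3lld\n",
           (long long)v,
           (unsigned long long)int2uint(v),
           (long long)uint2int(int2uint(v)));
  return 0;
}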
intprec - maxprec : 0; uint bits = maxbits; uint i, k, m, n; @@ -100,34 +135,145 @@ _t1(decode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxp data[i] = 0; /* decode one bit plane at a time from MSB to LSB */ - for (k = intprec, n = 0; bits && k-- > kmin;) { - /* decode first n bits of bit plane #k */ + for (k = intprec, m = n = 0; bits && (m = 0, k-- > kmin);) { + /* step 1: decode first n bits of bit plane #k */ m = MIN(n, bits); bits -= m; for (i = 0; i < m; i++) if (stream_read_bit(&s)) data[i] += (UInt)1 << k; - /* unary run-length decode remainder of bit plane */ - for (; n < size && bits && (--bits, stream_read_bit(&s)); data[n] += (UInt)1 << k, n++) - for (; n < size - 1 && bits && (--bits, !stream_read_bit(&s)); n++) - ; + /* step 2: unary run-length decode remainder of bit plane */ + for (; bits && n < size; n++, m = n) { + bits--; + if (stream_read_bit(&s)) { + /* positive group test; scan for next one-bit */ + for (; bits && n < size - 1; n++) { + bits--; + if (stream_read_bit(&s)) + break; + } + /* set bit and continue decoding bit plane */ + data[n] += (UInt)1 << k; + } + else { + /* negative group test; done with bit plane */ + m = size; + break; + } + } } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST + /* bias values to achieve proper rounding */ + _t1(inv_round, UInt)(data, size, m, intprec - k); +#endif + *stream = s; return maxbits - bits; } +/* decompress sequence of size <= 64 unsigned integers with no rate constraint */ +static uint +_t1(decode_few_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + bitstream_offset offset = stream_rtell(&s); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); + uint kmin = intprec > maxprec ? intprec - maxprec : 0; + uint i, k, n; + + /* initialize data array to all zeros */ + for (i = 0; i < size; i++) + data[i] = 0; + + /* decode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; k-- > kmin;) { + /* step 1: decode first n bits of bit plane #k */ + uint64 x = stream_read_bits(&s, n); + /* step 2: unary run-length decode remainder of bit plane */ + for (; n < size && stream_read_bit(&s); x += (uint64)1 << n, n++) + for (; n < size - 1 && !stream_read_bit(&s); n++) + ; + /* step 3: deposit bit plane from x */ + for (i = 0; x; i++, x >>= 1) + data[i] += (UInt)(x & 1u) << k; + } + +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST + /* bias values to achieve proper rounding */ + _t1(inv_round, UInt)(data, size, 0, intprec - k); +#endif + + *stream = s; + return (uint)(stream_rtell(&s) - offset); +} + +/* decompress sequence of size > 64 unsigned integers with no rate constraint */ +static uint +_t1(decode_many_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + bitstream_offset offset = stream_rtell(&s); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); + uint kmin = intprec > maxprec ? 
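The ZFP_ROUND_LAST branches above add (NBMASK >> 2) >> prec to each decoded value so that plain truncation of the remaining bit planes behaves like rounding; since NBMASK sets every other bit, NBMASK >> 2 is roughly one sixth of a full-width word, which the source comments describe as a 1/6 ulp bias (truncation error is not centered in a negabinary-style representation, hence not the familiar 1/2 ulp). A short numeric illustration, again assuming the alternating-bit mask:

#include <stdint.h>
#include <stdio.h>

#define NBMASK 0xaaaaaaaaaaaaaaaaULL

int main(void)
{
  /* NBMASK = 0b1010...10 is about 2/3 of 2^64, so NBMASK >> 2 is about 1/6 */
  double full = (double)UINT64_MAX;
  unsigned prec = 12;
  printf("NBMASK / 2^64        ~= %f\n", (double)NBMASK / full);        /* ~0.6667 */
  printf("(NBMASK >> 2) / 2^64 ~= %f\n", (double)(NBMASK >> 2) / full); /* ~0.1667 */
  /* with prec retained bit planes, the bias is about 1/6 of one ulp */
  printf("bias for prec=%u: %llu\n", prec,
         (unsigned long long)((NBMASK >> 2) >> prec));
  return 0;
}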
intprec - maxprec : 0; + uint i, k, n; + + /* initialize data array to all zeros */ + for (i = 0; i < size; i++) + data[i] = 0; + + /* decode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; k-- > kmin;) { + /* step 1: decode first n bits of bit plane #k */ + for (i = 0; i < n; i++) + if (stream_read_bit(&s)) + data[i] += (UInt)1 << k; + /* step 2: unary run-length decode remainder of bit plane */ + for (; n < size && stream_read_bit(&s); data[n] += (UInt)1 << k, n++) + for (; n < size - 1 && !stream_read_bit(&s); n++) + ; + } + +#if ZFP_ROUNDING_MODE == ZFP_ROUND_LAST + /* bias values to achieve proper rounding */ + _t1(inv_round, UInt)(data, size, 0, intprec - k); +#endif + + *stream = s; + return (uint)(stream_rtell(&s) - offset); +} + +/* decompress sequence of size unsigned integers */ +static uint +_t1(decode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, UInt* restrict_ data, uint size) +{ + /* use fastest available decoder implementation */ + if (with_maxbits(maxbits, maxprec, size)) { + /* rate constrained path: decode partial bit planes */ + if (size <= 64) + return _t1(decode_few_ints, UInt)(stream, maxbits, maxprec, data, size); /* 1D, 2D, 3D blocks */ + else + return _t1(decode_many_ints, UInt)(stream, maxbits, maxprec, data, size); /* 4D blocks */ + } + else { + /* variable-rate path: decode whole bit planes */ + if (size <= 64) + return _t1(decode_few_ints_prec, UInt)(stream, maxprec, data, size); /* 1D, 2D, 3D blocks */ + else + return _t1(decode_many_ints_prec, UInt)(stream, maxprec, data, size); /* 4D blocks */ + } +} + /* decode block of integers */ static uint -_t2(decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock) +_t2(decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock) { - int bits; + uint bits; cache_align_(UInt ublock[BLOCK_SIZE]); /* decode integer coefficients */ - if (BLOCK_SIZE <= 64) - bits = _t1(decode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); - else - bits = _t1(decode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + bits = _t1(decode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); /* read at least minbits bits */ if (bits < minbits) { stream_skip(stream, minbits - bits); diff --git a/src/template/decode1.c b/src/template/decode1.c index 68ee0793..76444d86 100644 --- a/src/template/decode1.c +++ b/src/template/decode1.c @@ -2,7 +2,7 @@ /* scatter 4-value block to strided array */ static void -_t2(scatter, Scalar, 1)(const Scalar* q, Scalar* p, int sx) +_t2(scatter, Scalar, 1)(const Scalar* q, Scalar* p, ptrdiff_t sx) { uint x; for (x = 0; x < 4; x++, p += sx) @@ -11,11 +11,11 @@ _t2(scatter, Scalar, 1)(const Scalar* q, Scalar* p, int sx) /* scatter nx-value block to strided array */ static void -_t2(scatter_partial, Scalar, 1)(const Scalar* q, Scalar* p, uint nx, int sx) +_t2(scatter_partial, Scalar, 1)(const Scalar* q, Scalar* p, size_t nx, ptrdiff_t sx) { - uint x; + size_t x; for (x = 0; x < nx; x++, p += sx) - *p = *q++; + *p = *q++; } /* inverse decorrelating 1D transform */ @@ -28,26 +28,26 @@ _t2(inv_xform, Int, 1)(Int* p) /* public functions -------------------------------------------------------- */ -/* decode 4-value floating-point block and store at p using stride sx */ -uint -_t2(zfp_decode_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, int sx) +/* decode 4-value block and store at p using stride sx */ +size_t +_t2(zfp_decode_block_strided, Scalar, 
1)(zfp_stream* stream, Scalar* p, ptrdiff_t sx) { /* decode contiguous block */ - cache_align_(Scalar fblock[4]); - uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock); + cache_align_(Scalar block[4]); + size_t bits = _t2(zfp_decode_block, Scalar, 1)(stream, block); /* scatter block to strided array */ - _t2(scatter, Scalar, 1)(fblock, p, sx); + _t2(scatter, Scalar, 1)(block, p, sx); return bits; } -/* decode nx-value floating-point block and store at p using stride sx */ -uint -_t2(zfp_decode_partial_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, uint nx, int sx) +/* decode nx-value block and store at p using stride sx */ +size_t +_t2(zfp_decode_partial_block_strided, Scalar, 1)(zfp_stream* stream, Scalar* p, size_t nx, ptrdiff_t sx) { /* decode contiguous block */ - cache_align_(Scalar fblock[4]); - uint bits = _t2(zfp_decode_block, Scalar, 1)(stream, fblock); + cache_align_(Scalar block[4]); + size_t bits = _t2(zfp_decode_block, Scalar, 1)(stream, block); /* scatter block to strided array */ - _t2(scatter_partial, Scalar, 1)(fblock, p, nx, sx); + _t2(scatter_partial, Scalar, 1)(block, p, nx, sx); return bits; } diff --git a/src/template/decode2.c b/src/template/decode2.c index 23e1892c..4d3d5bc2 100644 --- a/src/template/decode2.c +++ b/src/template/decode2.c @@ -2,7 +2,7 @@ /* scatter 4*4 block to strided array */ static void -_t2(scatter, Scalar, 2)(const Scalar* q, Scalar* p, int sx, int sy) +_t2(scatter, Scalar, 2)(const Scalar* q, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) { uint x, y; for (y = 0; y < 4; y++, p += sy - 4 * sx) @@ -12,9 +12,9 @@ _t2(scatter, Scalar, 2)(const Scalar* q, Scalar* p, int sx, int sy) /* scatter nx*ny block to strided array */ static void -_t2(scatter_partial, Scalar, 2)(const Scalar* q, Scalar* p, uint nx, uint ny, int sx, int sy) +_t2(scatter_partial, Scalar, 2)(const Scalar* q, Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { - uint x, y; + size_t x, y; for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx) for (x = 0; x < nx; x++, p += sx, q++) *p = *q; @@ -35,26 +35,26 @@ _t2(inv_xform, Int, 2)(Int* p) /* public functions -------------------------------------------------------- */ -/* decode 4*4 floating-point block and store at p using strides (sx, sy) */ -uint -_t2(zfp_decode_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, int sx, int sy) +/* decode 4*4 block and store at p using strides (sx, sy) */ +size_t +_t2(zfp_decode_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, ptrdiff_t sx, ptrdiff_t sy) { /* decode contiguous block */ - cache_align_(Scalar fblock[16]); - uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock); + cache_align_(Scalar block[16]); + size_t bits = _t2(zfp_decode_block, Scalar, 2)(stream, block); /* scatter block to strided array */ - _t2(scatter, Scalar, 2)(fblock, p, sx, sy); + _t2(scatter, Scalar, 2)(block, p, sx, sy); return bits; } -/* decode nx*ny floating-point block and store at p using strides (sx, sy) */ -uint -_t2(zfp_decode_partial_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, uint nx, uint ny, int sx, int sy) +/* decode nx*ny block and store at p using strides (sx, sy) */ +size_t +_t2(zfp_decode_partial_block_strided, Scalar, 2)(zfp_stream* stream, Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { /* decode contiguous block */ - cache_align_(Scalar fblock[16]); - uint bits = _t2(zfp_decode_block, Scalar, 2)(stream, fblock); + cache_align_(Scalar block[16]); + size_t bits = _t2(zfp_decode_block, Scalar, 2)(stream, block); /* 
scatter block to strided array */ - _t2(scatter_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy); + _t2(scatter_partial, Scalar, 2)(block, p, nx, ny, sx, sy); return bits; } diff --git a/src/template/decode3.c b/src/template/decode3.c index b4841182..c9232fcd 100644 --- a/src/template/decode3.c +++ b/src/template/decode3.c @@ -2,7 +2,7 @@ /* scatter 4*4*4 block to strided array */ static void -_t2(scatter, Scalar, 3)(const Scalar* q, Scalar* p, int sx, int sy, int sz) +_t2(scatter, Scalar, 3)(const Scalar* q, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { uint x, y, z; for (z = 0; z < 4; z++, p += sz - 4 * sy) @@ -13,9 +13,9 @@ _t2(scatter, Scalar, 3)(const Scalar* q, Scalar* p, int sx, int sy, int sz) /* scatter nx*ny*nz block to strided array */ static void -_t2(scatter_partial, Scalar, 3)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +_t2(scatter_partial, Scalar, 3)(const Scalar* q, Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { - uint x, y, z; + size_t x, y, z; for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny)) for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx)) for (x = 0; x < nx; x++, p += sx, q++) @@ -43,26 +43,26 @@ _t2(inv_xform, Int, 3)(Int* p) /* public functions -------------------------------------------------------- */ -/* decode 4*4*4 floating-point block and store at p using strides (sx, sy, sz) */ -uint -_t2(zfp_decode_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz) +/* decode 4*4*4 block and store at p using strides (sx, sy, sz) */ +size_t +_t2(zfp_decode_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { /* decode contiguous block */ - cache_align_(Scalar fblock[64]); - uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock); + cache_align_(Scalar block[64]); + size_t bits = _t2(zfp_decode_block, Scalar, 3)(stream, block); /* scatter block to strided array */ - _t2(scatter, Scalar, 3)(fblock, p, sx, sy, sz); + _t2(scatter, Scalar, 3)(block, p, sx, sy, sz); return bits; } -/* decode nx*ny*nz floating-point block and store at p using strides (sx, sy, sz) */ -uint -_t2(zfp_decode_partial_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +/* decode nx*ny*nz block and store at p using strides (sx, sy, sz) */ +size_t +_t2(zfp_decode_partial_block_strided, Scalar, 3)(zfp_stream* stream, Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { /* decode contiguous block */ - cache_align_(Scalar fblock[64]); - uint bits = _t2(zfp_decode_block, Scalar, 3)(stream, fblock); + cache_align_(Scalar block[64]); + size_t bits = _t2(zfp_decode_block, Scalar, 3)(stream, block); /* scatter block to strided array */ - _t2(scatter_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz); + _t2(scatter_partial, Scalar, 3)(block, p, nx, ny, nz, sx, sy, sz); return bits; } diff --git a/src/template/decode4.c b/src/template/decode4.c index 8d34abfc..3274b429 100644 --- a/src/template/decode4.c +++ b/src/template/decode4.c @@ -2,7 +2,7 @@ /* scatter 4*4*4*4 block to strided array */ static void -_t2(scatter, Scalar, 4)(const Scalar* q, Scalar* p, int sx, int sy, int sz, int sw) +_t2(scatter, Scalar, 4)(const Scalar* q, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { uint x, y, z, w; for (w = 0; w < 4; w++, p += sw - 4 * sz) @@ -14,9 +14,9 @@ _t2(scatter, Scalar, 4)(const Scalar* q, 
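The scatter_partial() variants above (and the matching gather_partial() encoders later) step two pointers in lock step: p walks the strided destination and is pulled back by sy - nx * sx at the end of each row, while q walks the contiguous 4^d block and skips the 4 - nx unused slots. A runnable 2D version of that pointer bookkeeping (scatter_partial_2d is an illustrative stand-in specialized to double):

#include <stddef.h>
#include <stdio.h>

/* scatter the leading nx*ny values of a contiguous 4x4 block into a strided
   destination, stepping the pointers exactly as the 2D scatter_partial does */
static void
scatter_partial_2d(const double* q, double* p,
                   size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy)
{
  size_t x, y;
  for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 4 - nx)
    for (x = 0; x < nx; x++, p += sx, q++)
      *p = *q;
}

int main(void)
{
  /* 4x4 block holding 0..15; copy its 3x2 corner into a 5-wide array */
  double block[16], dest[5 * 4] = { 0 };
  size_t i;
  for (i = 0; i < 16; i++)
    block[i] = (double)i;
  scatter_partial_2d(block, dest, 3, 2, 1, 5);
  for (i = 0; i < 5 * 4; i++)
    printf("%4.0f%s", dest[i], (i % 5 == 4) ? "\n" : " ");
  return 0;
}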
Scalar* p, int sx, int sy, int sz, int /* scatter nx*ny*nz*nw block to strided array */ static void -_t2(scatter_partial, Scalar, 4)(const Scalar* q, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +_t2(scatter_partial, Scalar, 4)(const Scalar* q, Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { - uint x, y, z, w; + size_t x, y, z, w; for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz, q += 16 * (4 - nz)) for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy, q += 4 * (4 - ny)) for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx, q += 1 * (4 - nx)) @@ -53,26 +53,26 @@ _t2(inv_xform, Int, 4)(Int* p) /* public functions -------------------------------------------------------- */ -/* decode 4*4*4*4 floating-point block and store at p using strides (sx, sy, sz, sw) */ -uint -_t2(zfp_decode_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, int sx, int sy, int sz, int sw) +/* decode 4*4*4*4 block and store at p using strides (sx, sy, sz, sw) */ +size_t +_t2(zfp_decode_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { /* decode contiguous block */ - cache_align_(Scalar fblock[256]); - uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock); + cache_align_(Scalar block[256]); + size_t bits = _t2(zfp_decode_block, Scalar, 4)(stream, block); /* scatter block to strided array */ - _t2(scatter, Scalar, 4)(fblock, p, sx, sy, sz, sw); + _t2(scatter, Scalar, 4)(block, p, sx, sy, sz, sw); return bits; } -/* decode nx*ny*nz*nw floating-point block and store at p using strides (sx, sy, sz, sw) */ -uint -_t2(zfp_decode_partial_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +/* decode nx*ny*nz*nw block and store at p using strides (sx, sy, sz, sw) */ +size_t +_t2(zfp_decode_partial_block_strided, Scalar, 4)(zfp_stream* stream, Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { /* decode contiguous block */ - cache_align_(Scalar fblock[256]); - uint bits = _t2(zfp_decode_block, Scalar, 4)(stream, fblock); + cache_align_(Scalar block[256]); + size_t bits = _t2(zfp_decode_block, Scalar, 4)(stream, block); /* scatter block to strided array */ - _t2(scatter_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw); + _t2(scatter_partial, Scalar, 4)(block, p, nx, ny, nz, nw, sx, sy, sz, sw); return bits; } diff --git a/src/template/decodef.c b/src/template/decodef.c index 5df15638..e8fa40c8 100644 --- a/src/template/decodef.c +++ b/src/template/decodef.c @@ -10,12 +10,14 @@ _t2(decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock) /* test if block has nonzero values */ if (stream_read_bit(zfp->stream)) { cache_align_(Int iblock[BLOCK_SIZE]); + uint maxprec; + int emax; /* decode common exponent */ bits += EBITS; - int emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS; - int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); + emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS; + maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); /* decode integer block */ - bits += _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, maxprec, iblock); + bits += _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, maxprec, iblock); /* perform inverse block-floating-point transform */ _t1(inv_cast, 
Scalar)(iblock, fblock, BLOCK_SIZE, emax); } @@ -35,7 +37,7 @@ _t2(decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock) /* public functions -------------------------------------------------------- */ /* decode contiguous floating-point block */ -uint +size_t _t2(zfp_decode_block, Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock) { return REVERSIBLE(zfp) ? _t2(rev_decode_block, Scalar, DIMS)(zfp, fblock) : _t2(decode_block, Scalar, DIMS)(zfp, fblock); diff --git a/src/template/decodei.c b/src/template/decodei.c index 12f62a98..3cea9651 100644 --- a/src/template/decodei.c +++ b/src/template/decodei.c @@ -1,9 +1,9 @@ -static uint _t2(rev_decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, Int* iblock); +static uint _t2(rev_decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, Int* iblock); /* public functions -------------------------------------------------------- */ /* decode contiguous integer block */ -uint +size_t _t2(zfp_decode_block, Int, DIMS)(zfp_stream* zfp, Int* iblock) { return REVERSIBLE(zfp) ? _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, iblock) : _t2(decode_block, Int, DIMS)(zfp->stream, zfp->minbits, zfp->maxbits, zfp->maxprec, iblock); diff --git a/src/template/decompress.c b/src/template/decompress.c index 22313f81..72610773 100644 --- a/src/template/decompress.c +++ b/src/template/decompress.c @@ -3,9 +3,9 @@ static void _t2(decompress, Scalar, 1)(zfp_stream* stream, zfp_field* field) { Scalar* data = (Scalar*)field->data; - uint nx = field->nx; - uint mx = nx & ~3u; - uint x; + size_t nx = field->nx; + size_t mx = nx & ~3u; + size_t x; /* decompress array one block of 4 values at a time */ for (x = 0; x < mx; x += 4, data += 4) @@ -19,9 +19,9 @@ static void _t2(decompress_strided, Scalar, 1)(zfp_stream* stream, zfp_field* field) { Scalar* data = field->data; - uint nx = field->nx; - int sx = field->sx ? field->sx : 1; - uint x; + size_t nx = field->nx; + ptrdiff_t sx = field->sx ? field->sx : 1; + size_t x; /* decompress array one block of 4 values at a time */ for (x = 0; x < nx; x += 4) { @@ -38,11 +38,11 @@ static void _t2(decompress_strided, Scalar, 2)(zfp_stream* stream, zfp_field* field) { Scalar* data = (Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - uint x, y; + size_t nx = field->nx; + size_t ny = field->ny; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + size_t x, y; /* decompress array one block of 4x4 values at a time */ for (y = 0; y < ny; y += 4) @@ -60,13 +60,13 @@ static void _t2(decompress_strided, Scalar, 3)(zfp_stream* stream, zfp_field* field) { Scalar* data = (Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); - uint x, y, z; + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? 
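Note the changed call in decodef.c above (mirrored in encodef.c further down): zfp->minbits - MIN(bits, zfp->minbits) instead of zfp->minbits - bits. With unsigned operands, subtracting a bit count that already exceeds minbits would wrap around to a huge value rather than clamping at zero. A tiny demonstration of the idiom:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
  unsigned minbits = 8, bits = 9; /* already consumed more than the minimum */
  printf("naive:      %u\n", minbits - bits);               /* wraps (4294967295 with 32-bit unsigned) */
  printf("saturating: %u\n", minbits - MIN(bits, minbits)); /* clamps to 0 */
  return 0;
}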
field->sz : (ptrdiff_t)(nx * ny); + size_t x, y, z; /* decompress array one block of 4x4x4 values at a time */ for (z = 0; z < nz; z += 4) @@ -85,15 +85,15 @@ static void _t2(decompress_strided, Scalar, 4)(zfp_stream* stream, zfp_field* field) { Scalar* data = field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - uint nw = field->nw; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); - int sw = field->sw ? field->sw : (int)(nx * ny * nz); - uint x, y, z, w; + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + size_t nw = field->nw; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny); + ptrdiff_t sw = field->sw ? field->sw : (ptrdiff_t)(nx * ny * nz); + size_t x, y, z, w; /* decompress array one block of 4x4x4x4 values at a time */ for (w = 0; w < nw; w += 4) diff --git a/src/template/encode.c b/src/template/encode.c index bba18f60..c085a4ab 100644 --- a/src/template/encode.c +++ b/src/template/encode.c @@ -6,7 +6,7 @@ static void _t2(fwd_xform, Int, DIMS)(Int* p); /* pad partial block of width n <= 4 and stride s */ static void -_t1(pad_block, Scalar)(Scalar* p, uint n, uint s) +_t1(pad_block, Scalar)(Scalar* p, size_t n, ptrdiff_t s) { switch (n) { case 0: @@ -28,7 +28,7 @@ _t1(pad_block, Scalar)(Scalar* p, uint n, uint s) /* forward lifting transform of 4-vector */ static void -_t1(fwd_lift, Int)(Int* p, uint s) +_t1(fwd_lift, Int)(Int* p, ptrdiff_t s) { Int x, y, z, w; x = *p; p += s; @@ -55,6 +55,22 @@ _t1(fwd_lift, Int)(Int* p, uint s) p -= s; *p = x; } +#if ZFP_ROUNDING_MODE == ZFP_ROUND_FIRST +/* bias values such that truncation is equivalent to round to nearest */ +static void +_t1(fwd_round, Int)(Int* iblock, uint n, uint maxprec) +{ + /* add or subtract 1/6 ulp to unbias errors */ + if (maxprec < (uint)(CHAR_BIT * sizeof(Int))) { + Int bias = (NBMASK >> 2) >> maxprec; + if (maxprec & 1u) + do *iblock++ += bias; while (--n); + else + do *iblock++ -= bias; while (--n); + } +} +#endif + /* map two's complement signed integer to negabinary unsigned integer */ static UInt _t1(int2uint, Int)(Int x) @@ -71,13 +87,13 @@ _t1(fwd_order, Int)(UInt* ublock, const Int* iblock, const uchar* perm, uint n) while (--n); } -/* compress sequence of size unsigned integers */ +/* compress sequence of size <= 64 unsigned integers */ static uint -_t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size) +_t1(encode_few_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size) { /* make a copy of bit stream to avoid aliasing */ bitstream s = *stream; - uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); uint kmin = intprec > maxprec ? 
intprec - maxprec : 0; uint bits = maxbits; uint i, k, m, n; @@ -94,9 +110,21 @@ _t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, bits -= m; x = stream_write_bits(&s, x, m); /* step 3: unary run-length encode remainder of bit plane */ - for (; n < size && bits && (bits--, stream_write_bit(&s, !!x)); x >>= 1, n++) - for (; n < size - 1 && bits && (bits--, !stream_write_bit(&s, x & 1u)); x >>= 1, n++) - ; + for (; bits && n < size; x >>= 1, n++) { + bits--; + if (stream_write_bit(&s, !!x)) { + /* positive group test (x != 0); scan for one-bit */ + for (; bits && n < size - 1; x >>= 1, n++) { + bits--; + if (stream_write_bit(&s, x & 1u)) + break; + } + } + else { + /* negative group test (x == 0); done with bit plane */ + break; + } + } } *stream = s; @@ -109,7 +137,7 @@ _t1(encode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxp { /* make a copy of bit stream to avoid aliasing */ bitstream s = *stream; - uint intprec = CHAR_BIT * (uint)sizeof(UInt); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); uint kmin = intprec > maxprec ? intprec - maxprec : 0; uint bits = maxbits; uint i, k, m, n, c; @@ -126,30 +154,123 @@ _t1(encode_many_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxp for (i = m; i < size; i++) c += (data[i] >> k) & 1u; /* step 3: unary run-length encode remainder of bit plane */ - for (; n < size && bits && (--bits, stream_write_bit(&s, !!c)); c--, n++) - for (; n < size - 1 && bits && (--bits, !stream_write_bit(&s, (data[n] >> k) & 1u)); n++) - ; + for (; bits && n < size; n++) { + bits--; + if (stream_write_bit(&s, !!c)) { + /* positive group test (c > 0); scan for one-bit */ + for (c--; bits && n < size - 1; n++) { + bits--; + if (stream_write_bit(&s, (data[n] >> k) & 1u)) + break; + } + } + else { + /* negative group test (c == 0); done with bit plane */ + break; + } + } } *stream = s; return maxbits - bits; } +/* compress sequence of size <= 64 unsigned integers with no rate constraint */ +static uint +_t1(encode_few_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, const UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + bitstream_offset offset = stream_wtell(&s); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); + uint kmin = intprec > maxprec ? intprec - maxprec : 0; + uint i, k, n; + + /* encode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; k-- > kmin;) { + /* step 1: extract bit plane #k to x */ + uint64 x = 0; + for (i = 0; i < size; i++) + x += (uint64)((data[i] >> k) & 1u) << i; + /* step 2: encode first n bits of bit plane */ + x = stream_write_bits(&s, x, n); + /* step 3: unary run-length encode remainder of bit plane */ + for (; n < size && stream_write_bit(&s, !!x); x >>= 1, n++) + for (; n < size - 1 && !stream_write_bit(&s, x & 1u); x >>= 1, n++) + ; + } + + *stream = s; + return (uint)(stream_wtell(&s) - offset); +} + +/* compress sequence of size > 64 unsigned integers with no rate constraint */ +static uint +_t1(encode_many_ints_prec, UInt)(bitstream* restrict_ stream, uint maxprec, const UInt* restrict_ data, uint size) +{ + /* make a copy of bit stream to avoid aliasing */ + bitstream s = *stream; + bitstream_offset offset = stream_wtell(&s); + uint intprec = (uint)(CHAR_BIT * sizeof(UInt)); + uint kmin = intprec > maxprec ? 
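encode_few_ints_prec() above gathers bit plane k of up to 64 values into one 64-bit word, emits verbatim the n bits already known to be significant from higher planes, and then unary run-length encodes the rest with group tests (the outer test bit says whether any one-bits remain in the plane; the inner bits scan up to the next one-bit). A simplified, self-contained version that appends '0'/'1' characters to a string instead of using zfp's bitstream, to show the control flow only (write_bit, write_bits, and encode_planes are stand-ins for the stream_* calls):

#include <stdint.h>
#include <stdio.h>

static char out[4096];
static size_t len;

/* append one bit as a character and return its value, like stream_write_bit */
static unsigned write_bit(unsigned bit) { out[len++] = (char)('0' + (bit & 1)); return bit & 1; }

/* append the low n bits of x, least significant first; return x >> n */
static uint64_t write_bits(uint64_t x, unsigned n)
{
  unsigned i;
  for (i = 0; i < n; i++, x >>= 1)
    write_bit((unsigned)(x & 1));
  return x;
}

/* encode the bit planes of size <= 64 values, most significant plane first */
static void encode_planes(const uint64_t* data, unsigned size, unsigned intprec)
{
  unsigned i, k, n = 0;
  for (k = intprec; k-- > 0;) {
    uint64_t x = 0;
    for (i = 0; i < size; i++)               /* gather bit plane k */
      x += (uint64_t)((data[i] >> k) & 1u) << i;
    x = write_bits(x, n);                    /* bits already known significant */
    /* unary run-length encode the remainder with group tests */
    for (; n < size && write_bit(x != 0); x >>= 1, n++)
      for (; n < size - 1 && !write_bit((unsigned)(x & 1)); x >>= 1, n++)
        ;
  }
}

int main(void)
{
  uint64_t block[4] = { 1, 0, 3, 0 };        /* tiny 4-value "block" */
  encode_planes(block, 4, 4);                /* pretend values have 4 bit planes */
  printf("%s\n", out);
  return 0;
}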
intprec - maxprec : 0; + uint i, k, n, c; + + /* encode one bit plane at a time from MSB to LSB */ + for (k = intprec, n = 0; k-- > kmin;) { + /* step 1: encode first n bits of bit plane #k */ + for (i = 0; i < n; i++) + stream_write_bit(&s, (data[i] >> k) & 1u); + /* step 2: count remaining one-bits in bit plane */ + c = 0; + for (i = n; i < size; i++) + c += (data[i] >> k) & 1u; + /* step 3: unary run-length encode remainder of bit plane */ + for (; n < size && stream_write_bit(&s, !!c); n++) + for (c--; n < size - 1 && !stream_write_bit(&s, (data[n] >> k) & 1u); n++) + ; + } + + *stream = s; + return (uint)(stream_wtell(&s) - offset); +} + +/* compress sequence of size unsigned integers */ +static uint +_t1(encode_ints, UInt)(bitstream* restrict_ stream, uint maxbits, uint maxprec, const UInt* restrict_ data, uint size) +{ + /* use fastest available encoder implementation */ + if (with_maxbits(maxbits, maxprec, size)) { + /* rate constrained path: encode partial bit planes */ + if (size <= 64) + return _t1(encode_few_ints, UInt)(stream, maxbits, maxprec, data, size); /* 1D, 2D, 3D blocks */ + else + return _t1(encode_many_ints, UInt)(stream, maxbits, maxprec, data, size); /* 4D blocks */ + } + else { + /* variable-rate path: encode whole bit planes */ + if (size <= 64) + return _t1(encode_few_ints_prec, UInt)(stream, maxprec, data, size); /* 1D, 2D, 3D blocks */ + else + return _t1(encode_many_ints_prec, UInt)(stream, maxprec, data, size); /* 4D blocks */ + } +} + /* encode block of integers */ static uint -_t2(encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock) +_t2(encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock) { - int bits; + uint bits; cache_align_(UInt ublock[BLOCK_SIZE]); /* perform decorrelating transform */ _t2(fwd_xform, Int, DIMS)(iblock); +#if ZFP_ROUNDING_MODE == ZFP_ROUND_FIRST + /* bias values to achieve proper rounding */ + _t1(fwd_round, Int)(iblock, BLOCK_SIZE, maxprec); +#endif /* reorder signed coefficients and convert to unsigned integer */ _t1(fwd_order, Int)(ublock, iblock, PERM, BLOCK_SIZE); /* encode integer coefficients */ - if (BLOCK_SIZE <= 64) - bits = _t1(encode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); - else - bits = _t1(encode_many_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); + bits = _t1(encode_ints, UInt)(stream, maxbits, maxprec, ublock, BLOCK_SIZE); /* write at least minbits bits by padding with zeros */ if (bits < minbits) { stream_pad(stream, minbits - bits); diff --git a/src/template/encode1.c b/src/template/encode1.c index c6184929..ff9d5c04 100644 --- a/src/template/encode1.c +++ b/src/template/encode1.c @@ -2,7 +2,7 @@ /* gather 4-value block from strided array */ static void -_t2(gather, Scalar, 1)(Scalar* q, const Scalar* p, int sx) +_t2(gather, Scalar, 1)(Scalar* q, const Scalar* p, ptrdiff_t sx) { uint x; for (x = 0; x < 4; x++, p += sx) @@ -11,9 +11,9 @@ _t2(gather, Scalar, 1)(Scalar* q, const Scalar* p, int sx) /* gather nx-value block from strided array */ static void -_t2(gather_partial, Scalar, 1)(Scalar* q, const Scalar* p, uint nx, int sx) +_t2(gather_partial, Scalar, 1)(Scalar* q, const Scalar* p, size_t nx, ptrdiff_t sx) { - uint x; + size_t x; for (x = 0; x < nx; x++, p += sx) q[x] = *p; _t1(pad_block, Scalar)(q, nx, 1); @@ -29,24 +29,24 @@ _t2(fwd_xform, Int, 1)(Int* p) /* public functions -------------------------------------------------------- */ -/* encode 4-value floating-point block stored at p using 
stride sx */ -uint -_t2(zfp_encode_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, int sx) +/* encode 4-value block stored at p using stride sx */ +size_t +_t2(zfp_encode_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, ptrdiff_t sx) { /* gather block from strided array */ - cache_align_(Scalar fblock[4]); - _t2(gather, Scalar, 1)(fblock, p, sx); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 1)(stream, fblock); + cache_align_(Scalar block[4]); + _t2(gather, Scalar, 1)(block, p, sx); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 1)(stream, block); } -/* encode nx-value floating-point block stored at p using stride sx */ -uint -_t2(zfp_encode_partial_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, uint nx, int sx) +/* encode nx-value block stored at p using stride sx */ +size_t +_t2(zfp_encode_partial_block_strided, Scalar, 1)(zfp_stream* stream, const Scalar* p, size_t nx, ptrdiff_t sx) { /* gather block from strided array */ - cache_align_(Scalar fblock[4]); - _t2(gather_partial, Scalar, 1)(fblock, p, nx, sx); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 1)(stream, fblock); + cache_align_(Scalar block[4]); + _t2(gather_partial, Scalar, 1)(block, p, nx, sx); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 1)(stream, block); } diff --git a/src/template/encode2.c b/src/template/encode2.c index 4bec256a..b77b4394 100644 --- a/src/template/encode2.c +++ b/src/template/encode2.c @@ -2,7 +2,7 @@ /* gather 4*4 block from strided array */ static void -_t2(gather, Scalar, 2)(Scalar* q, const Scalar* p, int sx, int sy) +_t2(gather, Scalar, 2)(Scalar* q, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) { uint x, y; for (y = 0; y < 4; y++, p += sy - 4 * sx) @@ -12,9 +12,9 @@ _t2(gather, Scalar, 2)(Scalar* q, const Scalar* p, int sx, int sy) /* gather nx*ny block from strided array */ static void -_t2(gather_partial, Scalar, 2)(Scalar* q, const Scalar* p, uint nx, uint ny, int sx, int sy) +_t2(gather_partial, Scalar, 2)(Scalar* q, const Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { - uint x, y; + size_t x, y; for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { for (x = 0; x < nx; x++, p += sx) q[4 * y + x] = *p; @@ -39,24 +39,24 @@ _t2(fwd_xform, Int, 2)(Int* p) /* public functions -------------------------------------------------------- */ -/* encode 4*4 floating-point block stored at p using strides (sx, sy) */ -uint -_t2(zfp_encode_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, int sx, int sy) +/* encode 4*4 block stored at p using strides (sx, sy) */ +size_t +_t2(zfp_encode_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy) { /* gather block from strided array */ - cache_align_(Scalar fblock[16]); - _t2(gather, Scalar, 2)(fblock, p, sx, sy); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 2)(stream, fblock); + cache_align_(Scalar block[16]); + _t2(gather, Scalar, 2)(block, p, sx, sy); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 2)(stream, block); } -/* encode nx*ny floating-point block stored at p using strides (sx, sy) */ -uint -_t2(zfp_encode_partial_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, int sx, int sy) +/* encode nx*ny block stored at p using strides (sx, sy) */ +size_t +_t2(zfp_encode_partial_block_strided, Scalar, 2)(zfp_stream* stream, const Scalar* p, size_t nx, size_t ny, ptrdiff_t sx, ptrdiff_t sy) { /* 
gather block from strided array */ - cache_align_(Scalar fblock[16]); - _t2(gather_partial, Scalar, 2)(fblock, p, nx, ny, sx, sy); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 2)(stream, fblock); + cache_align_(Scalar block[16]); + _t2(gather_partial, Scalar, 2)(block, p, nx, ny, sx, sy); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 2)(stream, block); } diff --git a/src/template/encode3.c b/src/template/encode3.c index a16a8add..3206060d 100644 --- a/src/template/encode3.c +++ b/src/template/encode3.c @@ -2,7 +2,7 @@ /* gather 4*4*4 block from strided array */ static void -_t2(gather, Scalar, 3)(Scalar* q, const Scalar* p, int sx, int sy, int sz) +_t2(gather, Scalar, 3)(Scalar* q, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { uint x, y, z; for (z = 0; z < 4; z++, p += sz - 4 * sy) @@ -13,9 +13,9 @@ _t2(gather, Scalar, 3)(Scalar* q, const Scalar* p, int sx, int sy, int sz) /* gather nx*ny*nz block from strided array */ static void -_t2(gather_partial, Scalar, 3)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +_t2(gather_partial, Scalar, 3)(Scalar* q, const Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { - uint x, y, z; + size_t x, y, z; for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) { for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { for (x = 0; x < nx; x++, p += sx) @@ -51,24 +51,24 @@ _t2(fwd_xform, Int, 3)(Int* p) /* public functions -------------------------------------------------------- */ -/* encode 4*4*4 floating-point block stored at p using strides (sx, sy, sz) */ -uint -_t2(zfp_encode_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz) +/* encode 4*4*4 block stored at p using strides (sx, sy, sz) */ +size_t +_t2(zfp_encode_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { /* gather block from strided array */ - cache_align_(Scalar fblock[64]); - _t2(gather, Scalar, 3)(fblock, p, sx, sy, sz); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 3)(stream, fblock); + cache_align_(Scalar block[64]); + _t2(gather, Scalar, 3)(block, p, sx, sy, sz); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 3)(stream, block); } -/* encode nx*ny*nz floating-point block stored at p using strides (sx, sy, sz) */ -uint -_t2(zfp_encode_partial_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, int sx, int sy, int sz) +/* encode nx*ny*nz block stored at p using strides (sx, sy, sz) */ +size_t +_t2(zfp_encode_partial_block_strided, Scalar, 3)(zfp_stream* stream, const Scalar* p, size_t nx, size_t ny, size_t nz, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { /* gather block from strided array */ - cache_align_(Scalar fblock[64]); - _t2(gather_partial, Scalar, 3)(fblock, p, nx, ny, nz, sx, sy, sz); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 3)(stream, fblock); + cache_align_(Scalar block[64]); + _t2(gather_partial, Scalar, 3)(block, p, nx, ny, nz, sx, sy, sz); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 3)(stream, block); } diff --git a/src/template/encode4.c b/src/template/encode4.c index c9ed5425..90ca40a8 100644 --- a/src/template/encode4.c +++ b/src/template/encode4.c @@ -2,7 +2,7 @@ /* gather 4*4*4*4 block from strided array */ static void -_t2(gather, Scalar, 4)(Scalar* q, const Scalar* p, int sx, int sy, int sz, int sw) +_t2(gather, Scalar, 
4)(Scalar* q, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { uint x, y, z, w; for (w = 0; w < 4; w++, p += sw - 4 * sz) @@ -14,9 +14,9 @@ _t2(gather, Scalar, 4)(Scalar* q, const Scalar* p, int sx, int sy, int sz, int s /* gather nx*ny*nz*nw block from strided array */ static void -_t2(gather_partial, Scalar, 4)(Scalar* q, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +_t2(gather_partial, Scalar, 4)(Scalar* q, const Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { - uint x, y, z, w; + size_t x, y, z, w; for (w = 0; w < nw; w++, p += sw - (ptrdiff_t)nz * sz) { for (z = 0; z < nz; z++, p += sz - (ptrdiff_t)ny * sy) { for (y = 0; y < ny; y++, p += sy - (ptrdiff_t)nx * sx) { @@ -66,24 +66,24 @@ _t2(fwd_xform, Int, 4)(Int* p) /* public functions -------------------------------------------------------- */ -/* encode 4*4*4*4 floating-point block stored at p using strides (sx, sy, sz, sw) */ -uint -_t2(zfp_encode_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, int sx, int sy, int sz, int sw) +/* encode 4*4*4*4 block stored at p using strides (sx, sy, sz, sw) */ +size_t +_t2(zfp_encode_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { /* gather block from strided array */ - cache_align_(Scalar fblock[256]); - _t2(gather, Scalar, 4)(fblock, p, sx, sy, sz, sw); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 4)(stream, fblock); + cache_align_(Scalar block[256]); + _t2(gather, Scalar, 4)(block, p, sx, sy, sz, sw); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 4)(stream, block); } -/* encode nx*ny*nz*nw floating-point block stored at p using strides (sx, sy, sz, sw) */ -uint -_t2(zfp_encode_partial_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, uint nx, uint ny, uint nz, uint nw, int sx, int sy, int sz, int sw) +/* encode nx*ny*nz*nw block stored at p using strides (sx, sy, sz, sw) */ +size_t +_t2(zfp_encode_partial_block_strided, Scalar, 4)(zfp_stream* stream, const Scalar* p, size_t nx, size_t ny, size_t nz, size_t nw, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { /* gather block from strided array */ - cache_align_(Scalar fblock[256]); - _t2(gather_partial, Scalar, 4)(fblock, p, nx, ny, nz, nw, sx, sy, sz, sw); - /* encode floating-point block */ - return _t2(zfp_encode_block, Scalar, 4)(stream, fblock); + cache_align_(Scalar block[256]); + _t2(gather_partial, Scalar, 4)(block, p, nx, ny, nz, nw, sx, sy, sz, sw); + /* encode block */ + return _t2(zfp_encode_block, Scalar, 4)(stream, block); } diff --git a/src/template/encodef.c b/src/template/encodef.c index 5c6ec537..10e50438 100644 --- a/src/template/encodef.c +++ b/src/template/encodef.c @@ -1,3 +1,4 @@ +#include #include #include @@ -9,13 +10,20 @@ static uint _t2(rev_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* f static int _t1(exponent, Scalar)(Scalar x) { + /* use e = -EBIAS when x = 0 */ + int e = -EBIAS; +#ifdef ZFP_WITH_DAZ + /* treat subnormals as zero; resolves issue #119 by avoiding overflow */ + if (x >= SCALAR_MIN) + FREXP(x, &e); +#else if (x > 0) { - int e; FREXP(x, &e); - /* clamp exponent in case x is denormal */ - return MAX(e, 1 - EBIAS); + /* clamp exponent in case x is subnormal; may still result in overflow */ + e = MAX(e, 1 - EBIAS); } - return -EBIAS; +#endif + return e; } /* compute maximum floating-point exponent in block of n 
values */ @@ -35,7 +43,7 @@ _t1(exponent_block, Scalar)(const Scalar* p, uint n) static Scalar _t1(quantize, Scalar)(Scalar x, int e) { - return LDEXP(x, (CHAR_BIT * (int)sizeof(Scalar) - 2) - e); + return LDEXP(x, ((int)(CHAR_BIT * sizeof(Scalar)) - 2) - e); } /* forward block-floating-point transform to signed integers */ @@ -57,8 +65,8 @@ _t2(encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) uint bits = 1; /* compute maximum exponent */ int emax = _t1(exponent_block, Scalar)(fblock, BLOCK_SIZE); - int maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); - uint e = maxprec ? emax + EBIAS : 0; + uint maxprec = precision(emax, zfp->maxprec, zfp->minexp, DIMS); + uint e = maxprec ? (uint)(emax + EBIAS) : 0; /* encode block only if biased exponent is nonzero */ if (e) { cache_align_(Int iblock[BLOCK_SIZE]); @@ -68,7 +76,7 @@ _t2(encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) /* perform forward block-floating-point transform */ _t1(fwd_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax); /* encode integer block */ - bits += _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, maxprec, iblock); + bits += _t2(encode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, maxprec, iblock); } else { /* write single zero-bit to indicate that all values are zero */ @@ -84,7 +92,7 @@ _t2(encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) /* public functions -------------------------------------------------------- */ /* encode contiguous floating-point block */ -uint +size_t _t2(zfp_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) { return REVERSIBLE(zfp) ? _t2(rev_encode_block, Scalar, DIMS)(zfp, fblock) : _t2(encode_block, Scalar, DIMS)(zfp, fblock); diff --git a/src/template/encodei.c b/src/template/encodei.c index 41d5fbd6..2aa4e7e3 100644 --- a/src/template/encodei.c +++ b/src/template/encodei.c @@ -1,9 +1,9 @@ -static uint _t2(rev_encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock); +static uint _t2(rev_encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock); /* public functions -------------------------------------------------------- */ /* encode contiguous integer block */ -uint +size_t _t2(zfp_encode_block, Int, DIMS)(zfp_stream* zfp, const Int* iblock) { cache_align_(Int block[BLOCK_SIZE]); diff --git a/src/template/ompcompress.c b/src/template/ompcompress.c index b0f86d23..4e4365c7 100644 --- a/src/template/ompcompress.c +++ b/src/template/ompcompress.c @@ -6,12 +6,13 @@ _t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { /* array metadata */ const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; + size_t nx = field->nx; /* number of omp threads, blocks, and chunks */ uint threads = thread_count_omp(stream); - uint blocks = (nx + 3) / 4; - uint chunks = chunk_count_omp(stream, blocks, threads); + size_t blocks = (nx + 3) / 4; + size_t chunks = chunk_count_omp(stream, blocks, threads); + int chunk; /* OpenMP 2.0 requires int loop counter */ /* allocate per-thread streams */ bitstream** bs = compress_init_par(stream, field, chunks, blocks); @@ -19,13 +20,12 @@ _t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) return; /* compress chunks of blocks in parallel */ - int chunk; #pragma omp parallel for num_threads(threads) for (chunk = 0; chunk < (int)chunks; chunk++) { /* determine range of block indices 
assigned to this thread */ - uint bmin = chunk_offset(blocks, chunks, chunk + 0); - uint bmax = chunk_offset(blocks, chunks, chunk + 1); - uint block; + size_t bmin = chunk_offset(blocks, chunks, chunk + 0); + size_t bmax = chunk_offset(blocks, chunks, chunk + 1); + size_t block; /* set up thread-local bit stream */ zfp_stream s = *stream; zfp_stream_set_bit_stream(&s, bs[chunk]); @@ -33,11 +33,11 @@ _t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) for (block = bmin; block < bmax; block++) { /* determine block origin x within array */ const Scalar* p = data; - uint x = 4 * block; + size_t x = 4 * block; p += x; /* compress partial or full block */ - if (nx - x < 4) - _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), 1); + if (nx - x < 4u) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, nx - x, 1); else _t2(zfp_encode_block, Scalar, 1)(&s, p); } @@ -53,13 +53,14 @@ _t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) { /* array metadata */ const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - int sx = field->sx ? field->sx : 1; + size_t nx = field->nx; + ptrdiff_t sx = field->sx ? field->sx : 1; /* number of omp threads, blocks, and chunks */ uint threads = thread_count_omp(stream); - uint blocks = (nx + 3) / 4; - uint chunks = chunk_count_omp(stream, blocks, threads); + size_t blocks = (nx + 3) / 4; + size_t chunks = chunk_count_omp(stream, blocks, threads); + int chunk; /* OpenMP 2.0 requires int loop counter */ /* allocate per-thread streams */ bitstream** bs = compress_init_par(stream, field, chunks, blocks); @@ -67,13 +68,12 @@ _t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) return; /* compress chunks of blocks in parallel */ - int chunk; #pragma omp parallel for num_threads(threads) for (chunk = 0; chunk < (int)chunks; chunk++) { /* determine range of block indices assigned to this thread */ - uint bmin = chunk_offset(blocks, chunks, chunk + 0); - uint bmax = chunk_offset(blocks, chunks, chunk + 1); - uint block; + size_t bmin = chunk_offset(blocks, chunks, chunk + 0); + size_t bmax = chunk_offset(blocks, chunks, chunk + 1); + size_t block; /* set up thread-local bit stream */ zfp_stream s = *stream; zfp_stream_set_bit_stream(&s, bs[chunk]); @@ -81,11 +81,11 @@ _t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field) for (block = bmin; block < bmax; block++) { /* determine block origin x within array */ const Scalar* p = data; - uint x = 4 * block; + size_t x = 4 * block; p += sx * (ptrdiff_t)x; /* compress partial or full block */ - if (nx - x < 4) - _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, MIN(nx - x, 4u), sx); + if (nx - x < 4u) + _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, nx - x, sx); else _t2(zfp_encode_block_strided, Scalar, 1)(&s, p, sx); } @@ -101,17 +101,18 @@ _t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field) { /* array metadata */ const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; + size_t nx = field->nx; + size_t ny = field->ny; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? 
field->sy : (ptrdiff_t)nx; /* number of omp threads, blocks, and chunks */ uint threads = thread_count_omp(stream); - uint bx = (nx + 3) / 4; - uint by = (ny + 3) / 4; - uint blocks = bx * by; - uint chunks = chunk_count_omp(stream, blocks, threads); + size_t bx = (nx + 3) / 4; + size_t by = (ny + 3) / 4; + size_t blocks = bx * by; + size_t chunks = chunk_count_omp(stream, blocks, threads); + int chunk; /* OpenMP 2.0 requires int loop counter */ /* allocate per-thread streams */ bitstream** bs = compress_init_par(stream, field, chunks, blocks); @@ -119,13 +120,12 @@ _t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field) return; /* compress chunks of blocks in parallel */ - int chunk; #pragma omp parallel for num_threads(threads) for (chunk = 0; chunk < (int)chunks; chunk++) { /* determine range of block indices assigned to this thread */ - uint bmin = chunk_offset(blocks, chunks, chunk + 0); - uint bmax = chunk_offset(blocks, chunks, chunk + 1); - uint block; + size_t bmin = chunk_offset(blocks, chunks, chunk + 0); + size_t bmax = chunk_offset(blocks, chunks, chunk + 1); + size_t block; /* set up thread-local bit stream */ zfp_stream s = *stream; zfp_stream_set_bit_stream(&s, bs[chunk]); @@ -133,13 +133,13 @@ _t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field) for (block = bmin; block < bmax; block++) { /* determine block origin (x, y) within array */ const Scalar* p = data; - uint b = block; - uint x, y; + size_t b = block; + size_t x, y; x = 4 * (b % bx); b /= bx; y = 4 * b; p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y; /* compress partial or full block */ - if (nx - x < 4 || ny - y < 4) + if (nx - x < 4u || ny - y < 4u) _t2(zfp_encode_partial_block_strided, Scalar, 2)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy); else _t2(zfp_encode_block_strided, Scalar, 2)(&s, p, sx, sy); @@ -156,20 +156,21 @@ _t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field) { /* array metadata */ const Scalar* data = (const Scalar*)field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? 
field->sz : (ptrdiff_t)(nx * ny); /* number of omp threads, blocks, and chunks */ uint threads = thread_count_omp(stream); - uint bx = (nx + 3) / 4; - uint by = (ny + 3) / 4; - uint bz = (nz + 3) / 4; - uint blocks = bx * by * bz; - uint chunks = chunk_count_omp(stream, blocks, threads); + size_t bx = (nx + 3) / 4; + size_t by = (ny + 3) / 4; + size_t bz = (nz + 3) / 4; + size_t blocks = bx * by * bz; + size_t chunks = chunk_count_omp(stream, blocks, threads); + int chunk; /* OpenMP 2.0 requires int loop counter */ /* allocate per-thread streams */ bitstream** bs = compress_init_par(stream, field, chunks, blocks); @@ -177,13 +178,12 @@ _t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field) return; /* compress chunks of blocks in parallel */ - int chunk; #pragma omp parallel for num_threads(threads) for (chunk = 0; chunk < (int)chunks; chunk++) { /* determine range of block indices assigned to this thread */ - uint bmin = chunk_offset(blocks, chunks, chunk + 0); - uint bmax = chunk_offset(blocks, chunks, chunk + 1); - uint block; + size_t bmin = chunk_offset(blocks, chunks, chunk + 0); + size_t bmax = chunk_offset(blocks, chunks, chunk + 1); + size_t block; /* set up thread-local bit stream */ zfp_stream s = *stream; zfp_stream_set_bit_stream(&s, bs[chunk]); @@ -191,14 +191,14 @@ _t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field) for (block = bmin; block < bmax; block++) { /* determine block origin (x, y, z) within array */ const Scalar* p = data; - uint b = block; - uint x, y, z; + size_t b = block; + size_t x, y, z; x = 4 * (b % bx); b /= bx; y = 4 * (b % by); b /= by; z = 4 * b; p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z; /* compress partial or full block */ - if (nx - x < 4 || ny - y < 4 || nz - z < 4) + if (nx - x < 4u || ny - y < 4u || nz - z < 4u) _t2(zfp_encode_partial_block_strided, Scalar, 3)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz); else _t2(zfp_encode_block_strided, Scalar, 3)(&s, p, sx, sy, sz); @@ -215,23 +215,24 @@ _t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field) { /* array metadata */ const Scalar* data = field->data; - uint nx = field->nx; - uint ny = field->ny; - uint nz = field->nz; - uint nw = field->nw; - int sx = field->sx ? field->sx : 1; - int sy = field->sy ? field->sy : (int)nx; - int sz = field->sz ? field->sz : (int)(nx * ny); - int sw = field->sw ? field->sw : (int)(nx * ny * nz); + size_t nx = field->nx; + size_t ny = field->ny; + size_t nz = field->nz; + size_t nw = field->nw; + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx; + ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny); + ptrdiff_t sw = field->sw ? 
field->sw : (ptrdiff_t)(nx * ny * nz); /* number of omp threads, blocks, and chunks */ uint threads = thread_count_omp(stream); - uint bx = (nx + 3) / 4; - uint by = (ny + 3) / 4; - uint bz = (nz + 3) / 4; - uint bw = (nw + 3) / 4; - uint blocks = bx * by * bz * bw; - uint chunks = chunk_count_omp(stream, blocks, threads); + size_t bx = (nx + 3) / 4; + size_t by = (ny + 3) / 4; + size_t bz = (nz + 3) / 4; + size_t bw = (nw + 3) / 4; + size_t blocks = bx * by * bz * bw; + size_t chunks = chunk_count_omp(stream, blocks, threads); + int chunk; /* OpenMP 2.0 requires int loop counter */ /* allocate per-thread streams */ bitstream** bs = compress_init_par(stream, field, chunks, blocks); @@ -239,13 +240,12 @@ _t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field) return; /* compress chunks of blocks in parallel */ - int chunk; #pragma omp parallel for num_threads(threads) for (chunk = 0; chunk < (int)chunks; chunk++) { /* determine range of block indices assigned to this thread */ - uint bmin = chunk_offset(blocks, chunks, chunk + 0); - uint bmax = chunk_offset(blocks, chunks, chunk + 1); - uint block; + size_t bmin = chunk_offset(blocks, chunks, chunk + 0); + size_t bmax = chunk_offset(blocks, chunks, chunk + 1); + size_t block; /* set up thread-local bit stream */ zfp_stream s = *stream; zfp_stream_set_bit_stream(&s, bs[chunk]); @@ -253,15 +253,15 @@ _t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field) for (block = bmin; block < bmax; block++) { /* determine block origin (x, y, z, w) within array */ const Scalar* p = data; - uint b = block; - uint x, y, z, w; + size_t b = block; + size_t x, y, z, w; x = 4 * (b % bx); b /= bx; y = 4 * (b % by); b /= by; z = 4 * (b % bz); b /= bz; w = 4 * b; p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w; /* compress partial or full block */ - if (nx - x < 4 || ny - y < 4 || nz - z < 4 || nw - w < 4) + if (nx - x < 4u || ny - y < 4u || nz - z < 4u || nw - w < 4u) _t2(zfp_encode_partial_block_strided, Scalar, 4)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw); else _t2(zfp_encode_block_strided, Scalar, 4)(&s, p, sx, sy, sz, sw); diff --git a/src/template/revdecode.c b/src/template/revdecode.c index cde9877f..115b0a17 100644 --- a/src/template/revdecode.c +++ b/src/template/revdecode.c @@ -31,17 +31,14 @@ _t1(rev_inv_lift, Int)(Int* p, uint s) /* decode block of integers using reversible algorithm */ static uint -_t2(rev_decode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, Int* iblock) +_t2(rev_decode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, Int* iblock) { /* decode number of significant bits */ - int bits = PBITS; - int prec = (int)stream_read_bits(stream, PBITS) + 1; + uint bits = PBITS; + uint prec = (uint)stream_read_bits(stream, PBITS) + 1; cache_align_(UInt ublock[BLOCK_SIZE]); /* decode integer coefficients */ - if (BLOCK_SIZE <= 64) - bits += _t1(decode_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); - else - bits += _t1(decode_many_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); + bits += _t1(decode_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); /* read at least minbits bits */ if (bits < minbits) { stream_skip(stream, minbits - bits); diff --git a/src/template/revdecodef.c b/src/template/revdecodef.c index 221a4b2e..5fafcefc 100644 --- a/src/template/revdecodef.c +++ b/src/template/revdecodef.c @@ -30,16 +30,17 @@ _t2(rev_decode_block, 
Scalar, DIMS)(zfp_stream* zfp, Scalar* fblock) bits++; if (stream_read_bit(zfp->stream)) { /* decode integer block */ - bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, iblock); + bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, iblock); /* reinterpret integers as floating values */ _t1(rev_inv_reinterpret, Scalar)(iblock, fblock, BLOCK_SIZE); } else { /* decode common exponent */ + int emax; bits += EBITS; - int emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS; + emax = (int)stream_read_bits(zfp->stream, EBITS) - EBIAS; /* decode integer block */ - bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, iblock); + bits += _t2(rev_decode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, iblock); /* perform inverse block-floating-point transform */ _t1(rev_inv_cast, Scalar)(iblock, fblock, BLOCK_SIZE, emax); } diff --git a/src/template/revencode.c b/src/template/revencode.c index f76238e9..fa162140 100644 --- a/src/template/revencode.c +++ b/src/template/revencode.c @@ -40,7 +40,7 @@ _t1(rev_precision, UInt)(const UInt* block, uint n) while (n--) m |= *block++; /* count trailing zeros via binary search */ - for (s = CHAR_BIT * (uint)sizeof(UInt); m; s /= 2) + for (s = (uint)(CHAR_BIT * sizeof(UInt)); m; s /= 2) if ((UInt)(m << (s - 1))) { m <<= s - 1; m <<= 1; @@ -51,10 +51,10 @@ _t1(rev_precision, UInt)(const UInt* block, uint n) /* encode block of integers using reversible algorithm */ static uint -_t2(rev_encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, int maxprec, Int* iblock) +_t2(rev_encode_block, Int, DIMS)(bitstream* stream, uint minbits, uint maxbits, uint maxprec, Int* iblock) { - int bits = PBITS; - int prec; + uint bits = PBITS; + uint prec; cache_align_(UInt ublock[BLOCK_SIZE]); /* perform decorrelating transform */ _t2(rev_fwd_xform, Int, DIMS)(iblock); @@ -66,10 +66,7 @@ _t2(rev_encode_block, Int, DIMS)(bitstream* stream, int minbits, int maxbits, in prec = MAX(prec, 1); stream_write_bits(stream, prec - 1, PBITS); /* encode integer coefficients */ - if (BLOCK_SIZE <= 64) - bits += _t1(encode_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); - else - bits += _t1(encode_many_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); + bits += _t1(encode_ints, UInt)(stream, maxbits - bits, prec, ublock, BLOCK_SIZE); /* write at least minbits bits by padding with zeros */ if (bits < minbits) { stream_pad(stream, minbits - bits); diff --git a/src/template/revencodef.c b/src/template/revencodef.c index 44ef3749..ee270aa7 100644 --- a/src/template/revencodef.c +++ b/src/template/revencodef.c @@ -53,7 +53,7 @@ _t2(rev_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) /* test if block-floating-point transform is reversible */ if (_t1(rev_fwd_reversible, Scalar)(iblock, fblock, BLOCK_SIZE, emax)) { /* transform is reversible; test if block has any non-zeros */ - uint e = emax + EBIAS; + uint e = (uint)(emax + EBIAS); if (e) { /* encode common exponent */ bits += 2; @@ -71,10 +71,10 @@ _t2(rev_encode_block, Scalar, DIMS)(zfp_stream* zfp, const Scalar* fblock) else { /* transform is irreversible; reinterpret floating values as integers */ _t1(rev_fwd_reinterpret, Scalar)(iblock, fblock, BLOCK_SIZE); - bits++; + bits += 2; stream_write_bits(zfp->stream, 3, 2); } /* losslessly encode integers */ - bits += _t2(rev_encode_block, Int, 
DIMS)(zfp->stream, zfp->minbits - bits, zfp->maxbits - bits, zfp->maxprec, iblock); + bits += _t2(rev_encode_block, Int, DIMS)(zfp->stream, zfp->minbits - MIN(bits, zfp->minbits), zfp->maxbits - bits, zfp->maxprec, iblock); return bits; } diff --git a/src/traitsd.h b/src/traitsd.h index 4dfb271b..05110d55 100644 --- a/src/traitsd.h +++ b/src/traitsd.h @@ -7,6 +7,7 @@ #define PBITS 6 /* number of bits needed to encode precision */ #define NBMASK UINT64C(0xaaaaaaaaaaaaaaaa) /* negabinary mask */ #define TCMASK UINT64C(0x7fffffffffffffff) /* two's complement mask */ +#define SCALAR_MIN DBL_MIN /* smallest positive normal number */ #define FABS(x) fabs(x) #define FREXP(x, e) frexp(x, e) diff --git a/src/traitsf.h b/src/traitsf.h index 408337e1..7e85299d 100644 --- a/src/traitsf.h +++ b/src/traitsf.h @@ -7,6 +7,7 @@ #define PBITS 5 /* number of bits needed to encode precision */ #define NBMASK 0xaaaaaaaau /* negabinary mask */ #define TCMASK 0x7fffffffu /* two's complement mask */ +#define SCALAR_MIN FLT_MIN /* smallest positive normal number */ #if __STDC_VERSION__ >= 199901L #define FABS(x) fabsf(x) diff --git a/src/zfp.c b/src/zfp.c index 54a0e5f2..a498a985 100644 --- a/src/zfp.c +++ b/src/zfp.c @@ -3,35 +3,42 @@ #include #include #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" +#include "zfp/version.h" #include "template/template.h" /* public data ------------------------------------------------------------- */ -export_ const uint zfp_codec_version = ZFP_CODEC; -export_ const uint zfp_library_version = ZFP_VERSION; -export_ const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (May 5, 2019)"; +const uint zfp_codec_version = ZFP_CODEC; +const uint zfp_library_version = ZFP_VERSION; +const char* const zfp_version_string = "zfp version " ZFP_VERSION_STRING " (August 1, 2022)"; /* private functions ------------------------------------------------------- */ -static uint -type_precision(zfp_type type) +static size_t +field_index_span(const zfp_field* field, ptrdiff_t* min, ptrdiff_t* max) { - switch (type) { - case zfp_type_int32: - return CHAR_BIT * (uint)sizeof(int32); - case zfp_type_int64: - return CHAR_BIT * (uint)sizeof(int64); - case zfp_type_float: - return CHAR_BIT * (uint)sizeof(float); - case zfp_type_double: - return CHAR_BIT * (uint)sizeof(double); - default: - return 0; - } + /* compute strides */ + ptrdiff_t sx = field->sx ? field->sx : 1; + ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)field->nx; + ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(field->nx * field->ny); + ptrdiff_t sw = field->sw ? field->sw : (ptrdiff_t)(field->nx * field->ny * field->nz); + /* compute largest offsets from base pointer */ + ptrdiff_t dx = field->nx ? sx * (ptrdiff_t)(field->nx - 1) : 0; + ptrdiff_t dy = field->ny ? sy * (ptrdiff_t)(field->ny - 1) : 0; + ptrdiff_t dz = field->nz ? sz * (ptrdiff_t)(field->nz - 1) : 0; + ptrdiff_t dw = field->nw ? 
sw * (ptrdiff_t)(field->nw - 1) : 0; + /* compute lowest and highest offset */ + ptrdiff_t imin = MIN(dx, 0) + MIN(dy, 0) + MIN(dz, 0) + MIN(dw, 0); + ptrdiff_t imax = MAX(dx, 0) + MAX(dy, 0) + MAX(dz, 0) + MAX(dw, 0); + if (min) + *min = imin; + if (max) + *max = imax; + return (size_t)(imax - imin + 1); } -static int +static zfp_bool is_reversible(const zfp_stream* zfp) { return zfp->minexp < ZFP_MIN_EXP; @@ -111,7 +118,7 @@ zfp_field_alloc() } zfp_field* -zfp_field_1d(void* data, zfp_type type, uint nx) +zfp_field_1d(void* data, zfp_type type, size_t nx) { zfp_field* field = zfp_field_alloc(); if (field) { @@ -123,7 +130,7 @@ zfp_field_1d(void* data, zfp_type type, uint nx) } zfp_field* -zfp_field_2d(void* data, zfp_type type, uint nx, uint ny) +zfp_field_2d(void* data, zfp_type type, size_t nx, size_t ny) { zfp_field* field = zfp_field_alloc(); if (field) { @@ -136,7 +143,7 @@ zfp_field_2d(void* data, zfp_type type, uint nx, uint ny) } zfp_field* -zfp_field_3d(void* data, zfp_type type, uint nx, uint ny, uint nz) +zfp_field_3d(void* data, zfp_type type, size_t nx, size_t ny, size_t nz) { zfp_field* field = zfp_field_alloc(); if (field) { @@ -150,7 +157,7 @@ zfp_field_3d(void* data, zfp_type type, uint nx, uint ny, uint nz) } zfp_field* -zfp_field_4d(void* data, zfp_type type, uint nx, uint ny, uint nz, uint nw) +zfp_field_4d(void* data, zfp_type type, size_t nx, size_t ny, size_t nz, size_t nw) { zfp_field* field = zfp_field_alloc(); if (field) { @@ -176,6 +183,18 @@ zfp_field_pointer(const zfp_field* field) return field->data; } +void* +zfp_field_begin(const zfp_field* field) +{ + if (field->data) { + ptrdiff_t min; + field_index_span(field, &min, NULL); + return (void*)((uchar*)field->data + min * (ptrdiff_t)zfp_type_size(field->type)); + } + else + return NULL; +} + zfp_type zfp_field_type(const zfp_field* field) { @@ -185,7 +204,7 @@ zfp_field_type(const zfp_field* field) uint zfp_field_precision(const zfp_field* field) { - return type_precision(field->type); + return (uint)(CHAR_BIT * zfp_type_size(field->type)); } uint @@ -195,7 +214,7 @@ zfp_field_dimensionality(const zfp_field* field) } size_t -zfp_field_size(const zfp_field* field, uint* size) +zfp_field_size(const zfp_field* field, size_t* size) { if (size) switch (zfp_field_dimensionality(field)) { @@ -212,22 +231,44 @@ zfp_field_size(const zfp_field* field, uint* size) size[0] = field->nx; break; } - return (size_t)MAX(field->nx, 1u) * (size_t)MAX(field->ny, 1u) * (size_t)MAX(field->nz, 1u) * (size_t)MAX(field->nw, 1u); + return MAX(field->nx, 1u) * MAX(field->ny, 1u) * MAX(field->nz, 1u) * MAX(field->nw, 1u); +} + +size_t +zfp_field_size_bytes(const zfp_field* field) +{ + return field_index_span(field, NULL, NULL) * zfp_type_size(field->type); +} + +size_t +zfp_field_blocks(const zfp_field* field) +{ + size_t bx = (field->nx + 3) / 4; + size_t by = (field->ny + 3) / 4; + size_t bz = (field->nz + 3) / 4; + size_t bw = (field->nw + 3) / 4; + switch (zfp_field_dimensionality(field)) { + case 1: return bx; + case 2: return bx * by; + case 3: return bx * by * bz; + case 4: return bx * by * bz * bw; + default: return 0; + } } -int -zfp_field_stride(const zfp_field* field, int* stride) +zfp_bool +zfp_field_stride(const zfp_field* field, ptrdiff_t* stride) { if (stride) switch (zfp_field_dimensionality(field)) { case 4: - stride[3] = field->sw ? field->sw : (int)(field->nx * field->ny * field->nz); + stride[3] = field->sw ? 
field->sw : (ptrdiff_t)(field->nx * field->ny * field->nz); /* FALLTHROUGH */ case 3: - stride[2] = field->sz ? field->sz : (int)(field->nx * field->ny); + stride[2] = field->sz ? field->sz : (ptrdiff_t)(field->nx * field->ny); /* FALLTHROUGH */ case 2: - stride[1] = field->sy ? field->sy : (int)field->nx; + stride[1] = field->sy ? field->sy : (ptrdiff_t)field->nx; /* FALLTHROUGH */ case 1: stride[0] = field->sx ? field->sx : 1; @@ -236,6 +277,12 @@ zfp_field_stride(const zfp_field* field, int* stride) return field->sx || field->sy || field->sz || field->sw; } +zfp_bool +zfp_field_is_contiguous(const zfp_field* field) +{ + return field_index_span(field, NULL, NULL) == zfp_field_size(field, NULL); +} + uint64 zfp_field_metadata(const zfp_field* field) { @@ -304,7 +351,7 @@ zfp_field_set_type(zfp_field* field, zfp_type type) } void -zfp_field_set_size_1d(zfp_field* field, uint n) +zfp_field_set_size_1d(zfp_field* field, size_t n) { field->nx = n; field->ny = 0; @@ -313,7 +360,7 @@ zfp_field_set_size_1d(zfp_field* field, uint n) } void -zfp_field_set_size_2d(zfp_field* field, uint nx, uint ny) +zfp_field_set_size_2d(zfp_field* field, size_t nx, size_t ny) { field->nx = nx; field->ny = ny; @@ -322,7 +369,7 @@ zfp_field_set_size_2d(zfp_field* field, uint nx, uint ny) } void -zfp_field_set_size_3d(zfp_field* field, uint nx, uint ny, uint nz) +zfp_field_set_size_3d(zfp_field* field, size_t nx, size_t ny, size_t nz) { field->nx = nx; field->ny = ny; @@ -331,7 +378,7 @@ zfp_field_set_size_3d(zfp_field* field, uint nx, uint ny, uint nz) } void -zfp_field_set_size_4d(zfp_field* field, uint nx, uint ny, uint nz, uint nw) +zfp_field_set_size_4d(zfp_field* field, size_t nx, size_t ny, size_t nz, size_t nw) { field->nx = nx; field->ny = ny; @@ -340,7 +387,7 @@ zfp_field_set_size_4d(zfp_field* field, uint nx, uint ny, uint nz, uint nw) } void -zfp_field_set_stride_1d(zfp_field* field, int sx) +zfp_field_set_stride_1d(zfp_field* field, ptrdiff_t sx) { field->sx = sx; field->sy = 0; @@ -349,7 +396,7 @@ zfp_field_set_stride_1d(zfp_field* field, int sx) } void -zfp_field_set_stride_2d(zfp_field* field, int sx, int sy) +zfp_field_set_stride_2d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy) { field->sx = sx; field->sy = sy; @@ -358,7 +405,7 @@ zfp_field_set_stride_2d(zfp_field* field, int sx, int sy) } void -zfp_field_set_stride_3d(zfp_field* field, int sx, int sy, int sz) +zfp_field_set_stride_3d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz) { field->sx = sx; field->sy = sy; @@ -367,7 +414,7 @@ zfp_field_set_stride_3d(zfp_field* field, int sx, int sy, int sz) } void -zfp_field_set_stride_4d(zfp_field* field, int sx, int sy, int sz, int sw) +zfp_field_set_stride_4d(zfp_field* field, ptrdiff_t sx, ptrdiff_t sy, ptrdiff_t sz, ptrdiff_t sw) { field->sx = sx; field->sy = sy; @@ -375,44 +422,113 @@ zfp_field_set_stride_4d(zfp_field* field, int sx, int sy, int sz, int sw) field->sw = sw; } -int +zfp_bool zfp_field_set_metadata(zfp_field* field, uint64 meta) { uint64 dims; /* ensure value is in range */ if (meta >> ZFP_META_BITS) - return 0; + return zfp_false; field->type = (zfp_type)((meta & 0x3u) + 1); meta >>= 2; dims = (meta & 0x3u) + 1; meta >>= 2; switch (dims) { case 1: /* currently dimensions are limited to 2^32 - 1 */ - field->nx = (meta & UINT64C(0x0000ffffffff)) + 1; meta >>= 48; + field->nx = (size_t)(meta & UINT64C(0x0000ffffffff)) + 1; meta >>= 48; field->ny = 0; field->nz = 0; field->nw = 0; break; case 2: - field->nx = (meta & UINT64C(0xffffff)) + 1; meta >>= 24; - field->ny = (meta & 
UINT64C(0xffffff)) + 1; meta >>= 24; + field->nx = (size_t)(meta & UINT64C(0xffffff)) + 1; meta >>= 24; + field->ny = (size_t)(meta & UINT64C(0xffffff)) + 1; meta >>= 24; field->nz = 0; field->nw = 0; break; case 3: - field->nx = (meta & UINT64C(0xffff)) + 1; meta >>= 16; - field->ny = (meta & UINT64C(0xffff)) + 1; meta >>= 16; - field->nz = (meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->nx = (size_t)(meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->ny = (size_t)(meta & UINT64C(0xffff)) + 1; meta >>= 16; + field->nz = (size_t)(meta & UINT64C(0xffff)) + 1; meta >>= 16; field->nw = 0; break; case 4: - field->nx = (meta & UINT64C(0xfff)) + 1; meta >>= 12; - field->ny = (meta & UINT64C(0xfff)) + 1; meta >>= 12; - field->nz = (meta & UINT64C(0xfff)) + 1; meta >>= 12; - field->nw = (meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->nx = (size_t)(meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->ny = (size_t)(meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->nz = (size_t)(meta & UINT64C(0xfff)) + 1; meta >>= 12; + field->nw = (size_t)(meta & UINT64C(0xfff)) + 1; meta >>= 12; break; } field->sx = field->sy = field->sz = field->sw = 0; - return 1; + return zfp_true; +} + +/* public functions: compression mode and parameter settings --------------- */ + +zfp_config +zfp_config_none() +{ + zfp_config config; + config.mode = zfp_mode_null; + return config; +} + +zfp_config +zfp_config_rate( + double rate, + zfp_bool align +) +{ + zfp_config config; + config.mode = zfp_mode_fixed_rate; + config.arg.rate = align ? -rate : +rate; + return config; +} + +zfp_config +zfp_config_precision( + uint precision +) +{ + zfp_config config; + config.mode = zfp_mode_fixed_precision; + config.arg.precision = precision; + return config; +} + +zfp_config +zfp_config_accuracy( + double tolerance +) +{ + zfp_config config; + config.mode = zfp_mode_fixed_accuracy; + config.arg.tolerance = tolerance; + return config; +} + +zfp_config +zfp_config_reversible() +{ + zfp_config config; + config.mode = zfp_mode_reversible; + return config; +} + +zfp_config +zfp_config_expert( + uint minbits, + uint maxbits, + uint maxprec, + int minexp +) +{ + zfp_config config; + config.mode = zfp_mode_expert; + config.arg.expert.minbits = minbits; + config.arg.expert.maxbits = maxbits; + config.arg.expert.maxprec = maxprec; + config.arg.expert.minexp = minexp; + return config; } /* public functions: zfp compressed stream --------------------------------- */ @@ -428,6 +544,7 @@ zfp_stream_open(bitstream* stream) zfp->maxprec = ZFP_MAX_PREC; zfp->minexp = ZFP_MIN_EXP; zfp->exec.policy = zfp_exec_serial; + zfp->exec.params = NULL; } return zfp; } @@ -435,6 +552,8 @@ zfp_stream_open(bitstream* stream) void zfp_stream_close(zfp_stream* zfp) { + if (zfp->exec.params != NULL) + free(zfp->exec.params); free(zfp); } @@ -488,6 +607,30 @@ zfp_stream_compression_mode(const zfp_stream* zfp) return zfp_mode_expert; } +double +zfp_stream_rate(const zfp_stream* zfp, uint dims) +{ + return (zfp_stream_compression_mode(zfp) == zfp_mode_fixed_rate) + ? (double)zfp->maxbits / (1u << (2 * dims)) + : 0.0; +} + +uint +zfp_stream_precision(const zfp_stream* zfp) +{ + return (zfp_stream_compression_mode(zfp) == zfp_mode_fixed_precision) + ? zfp->maxprec + : 0; +} + +double +zfp_stream_accuracy(const zfp_stream* zfp) +{ + return (zfp_stream_compression_mode(zfp) == zfp_mode_fixed_accuracy) + ? 
ldexp(1.0, zfp->minexp) + : 0.0; +} + uint64 zfp_stream_mode(const zfp_stream* zfp) { @@ -520,7 +663,7 @@ zfp_stream_mode(const zfp_stream* zfp) /* minexp is [ZFP_MIN_EXP=-1074, 843] */ /* returns [2177, ZFP_MODE_SHORT_MAX=4094] */ /* +1 because skipped 2176 */ - return (zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1); + return (uint64)(zfp->minexp - ZFP_MIN_EXP) + (2048 + 128 + 1); else break; @@ -536,7 +679,7 @@ zfp_stream_mode(const zfp_stream* zfp) minbits = MAX(1, MIN(zfp->minbits, 0x8000u)) - 1; maxbits = MAX(1, MIN(zfp->maxbits, 0x8000u)) - 1; maxprec = MAX(1, MIN(zfp->maxprec, 0x0080u)) - 1; - minexp = MAX(0, MIN(zfp->minexp + 16495, 0x7fff)); + minexp = (uint)MAX(0, MIN(zfp->minexp + 16495, 0x7fff)); mode <<= 15; mode += minexp; mode <<= 7; mode += maxprec; mode <<= 15; mode += maxbits; @@ -568,34 +711,31 @@ zfp_stream_compressed_size(const zfp_stream* zfp) size_t zfp_stream_maximum_size(const zfp_stream* zfp, const zfp_field* field) { + zfp_bool reversible = is_reversible(zfp); uint dims = zfp_field_dimensionality(field); - uint mx = (MAX(field->nx, 1u) + 3) / 4; - uint my = (MAX(field->ny, 1u) + 3) / 4; - uint mz = (MAX(field->nz, 1u) + 3) / 4; - uint mw = (MAX(field->nw, 1u) + 3) / 4; - size_t blocks = (size_t)mx * (size_t)my * (size_t)mz * (size_t)mw; + size_t blocks = zfp_field_blocks(field); uint values = 1u << (2 * dims); - uint maxbits = 1; + uint maxbits = 0; if (!dims) return 0; switch (field->type) { - case zfp_type_none: - return 0; + case zfp_type_int32: + maxbits += reversible ? 5 : 0; + break; + case zfp_type_int64: + maxbits += reversible ? 6 : 0; + break; case zfp_type_float: - maxbits += 8; - if (is_reversible(zfp)) - maxbits += 5; + maxbits += reversible ? 1 + 1 + 8 + 5 : 1 + 8; break; case zfp_type_double: - maxbits += 11; - if (is_reversible(zfp)) - maxbits += 6; + maxbits += reversible ? 
1 + 1 + 11 + 6 : 1 + 11; break; default: - break; + return 0; } - maxbits += values - 1 + values * MIN(zfp->maxprec, type_precision(field->type)); + maxbits += values - 1 + values * MIN(zfp->maxprec, zfp_field_precision(field)); maxbits = MIN(maxbits, zfp->maxbits); maxbits = MAX(maxbits, zfp->minbits); return ((ZFP_HEADER_MAX_BITS + blocks * maxbits + stream_word_bits - 1) & ~(stream_word_bits - 1)) / CHAR_BIT; @@ -617,7 +757,7 @@ zfp_stream_set_reversible(zfp_stream* zfp) } double -zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, int wra) +zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, zfp_bool align) { uint n = 1u << (2 * dims); uint bits = (uint)floor(n * rate + 0.5); @@ -631,7 +771,7 @@ zfp_stream_set_rate(zfp_stream* zfp, double rate, zfp_type type, uint dims, int default: break; } - if (wra) { + if (align) { /* for write random access, round up to next multiple of stream word size */ bits += (uint)stream_word_bits - 1; bits &= ~(stream_word_bits - 1); @@ -703,15 +843,15 @@ zfp_stream_set_mode(zfp_stream* zfp, uint64 mode) minbits = ZFP_MIN_BITS; maxbits = ZFP_MAX_BITS; maxprec = ZFP_MAX_PREC; - minexp = (uint)mode + ZFP_MIN_EXP - (2048 + 128 + 1); + minexp = (int)mode + ZFP_MIN_EXP - (2048 + 128 + 1); } } else { /* 64-bit encoding */ - mode >>= 12; minbits = ((uint)mode & 0x7fffu) + 1; - mode >>= 15; maxbits = ((uint)mode & 0x7fffu) + 1; - mode >>= 15; maxprec = ((uint)mode & 0x007fu) + 1; - mode >>= 7; minexp = ((uint)mode & 0x7fffu) - 16495; + mode >>= 12; minbits = (uint)(mode & 0x7fffu) + 1; + mode >>= 15; maxbits = (uint)(mode & 0x7fffu) + 1; + mode >>= 15; maxprec = (uint)(mode & 0x007fu) + 1; + mode >>= 7; minexp = (int)(mode & 0x7fffu) - 16495; } if (!zfp_stream_set_params(zfp, minbits, maxbits, maxprec, minexp)) @@ -720,16 +860,16 @@ zfp_stream_set_mode(zfp_stream* zfp, uint64 mode) return zfp_stream_compression_mode(zfp); } -int +zfp_bool zfp_stream_set_params(zfp_stream* zfp, uint minbits, uint maxbits, uint maxprec, int minexp) { if (minbits > maxbits || !(0 < maxprec && maxprec <= 64)) - return 0; + return zfp_false; zfp->minbits = minbits; zfp->maxbits = maxbits; zfp->maxprec = maxprec; zfp->minexp = minexp; - return 1; + return zfp_true; } size_t @@ -761,58 +901,75 @@ zfp_stream_execution(const zfp_stream* zfp) uint zfp_stream_omp_threads(const zfp_stream* zfp) { - return zfp->exec.params.omp.threads; + if (zfp->exec.policy == zfp_exec_omp) + return ((zfp_exec_params_omp*)zfp->exec.params)->threads; + return 0u; } uint zfp_stream_omp_chunk_size(const zfp_stream* zfp) { - return zfp->exec.params.omp.chunk_size; + if (zfp->exec.policy == zfp_exec_omp) + return ((zfp_exec_params_omp*)zfp->exec.params)->chunk_size; + return 0u; } -int +zfp_bool zfp_stream_set_execution(zfp_stream* zfp, zfp_exec_policy policy) { switch (policy) { case zfp_exec_serial: + if (zfp->exec.policy != policy && zfp->exec.params != NULL) { + free(zfp->exec.params); + zfp->exec.params = NULL; + } break; #ifdef ZFP_WITH_CUDA case zfp_exec_cuda: + if (zfp->exec.policy != policy && zfp->exec.params != NULL) { + free(zfp->exec.params); + zfp->exec.params = NULL; + } break; #endif case zfp_exec_omp: #ifdef _OPENMP if (zfp->exec.policy != policy) { - zfp->exec.params.omp.threads = 0; - zfp->exec.params.omp.chunk_size = 0; + if (zfp->exec.params != NULL) { + free(zfp->exec.params); + } + zfp_exec_params_omp* params = malloc(sizeof(zfp_exec_params_omp)); + params->threads = 0; + params->chunk_size = 0; + zfp->exec.params = (void*)params; } break; #else 
- return 0; + return zfp_false; #endif default: - return 0; + return zfp_false; } zfp->exec.policy = policy; - return 1; + return zfp_true; } -int +zfp_bool zfp_stream_set_omp_threads(zfp_stream* zfp, uint threads) { if (!zfp_stream_set_execution(zfp, zfp_exec_omp)) - return 0; - zfp->exec.params.omp.threads = threads; - return 1; + return zfp_false; + ((zfp_exec_params_omp*)zfp->exec.params)->threads = threads; + return zfp_true; } -int +zfp_bool zfp_stream_set_omp_chunk_size(zfp_stream* zfp, uint chunk_size) { if (!zfp_stream_set_execution(zfp, zfp_exec_omp)) - return 0; - zfp->exec.params.omp.chunk_size = chunk_size; - return 1; + return zfp_false; + ((zfp_exec_params_omp*)zfp->exec.params)->chunk_size = chunk_size; + return zfp_true; } /* public functions: utility functions --------------------------------------*/ @@ -935,7 +1092,7 @@ zfp_compress(zfp_stream* zfp, const zfp_field* field) #endif }; uint exec = zfp->exec.policy; - uint strided = zfp_field_stride(field, NULL); + uint strided = (uint)zfp_field_stride(field, NULL); uint dims = zfp_field_dimensionality(field); uint type = field->type; void (*compress)(zfp_stream*, const zfp_field*); @@ -995,7 +1152,7 @@ zfp_decompress(zfp_stream* zfp, zfp_field* field) #endif }; uint exec = zfp->exec.policy; - uint strided = zfp_field_stride(field, NULL); + uint strided = (uint)zfp_field_stride(field, NULL); uint dims = zfp_field_dimensionality(field); uint type = field->type; void (*decompress)(zfp_stream*, zfp_field*); diff --git a/tests/Makefile b/tests/Makefile index 2c496ee3..65be9c52 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -2,12 +2,13 @@ include ../Config BINDIR = ../bin TARGETS = $(BINDIR)/testzfp -CXXLIBS = -L../lib -lzfp +INCS = -I../include +LIBS = -L../lib -lzfp $(LDFLAGS) all: $(TARGETS) $(BINDIR)/testzfp: testzfp.cpp ../lib/$(LIBZFP) - $(CXX) $(CXXFLAGS) -I../array testzfp.cpp $(CXXLIBS) -o $@ + $(CXX) $(CXXFLAGS) $(INCS) testzfp.cpp $(LIBS) -o $@ test: $(BINDIR)/testzfp $(BINDIR)/testzfp diff --git a/tests/gitlab/corona-jobs.yml b/tests/gitlab/corona-jobs.yml new file mode 100644 index 00000000..fe75a6e5 --- /dev/null +++ b/tests/gitlab/corona-jobs.yml @@ -0,0 +1,17 @@ +########### +# HIP GPU # +########### + +rocm-3.10.0_build: + variables: + ci_cmake: "cmake/3.21.1" + ci_cmp_mod: "rocm/3.10.0" + ci_cmp_path: "/opt/rocm-3.10.0/hip" + extends: [.hip, .corona_build_gpu] + needs: [] + +rocm-3.10.0_test: + variables: + ci_test_regex: "Hip" + extends: [.corona_test_gpu] + needs: [rocm-3.10.0_build] diff --git a/tests/gitlab/corona-templates.yml b/tests/gitlab/corona-templates.yml new file mode 100644 index 00000000..770bd5a8 --- /dev/null +++ b/tests/gitlab/corona-templates.yml @@ -0,0 +1,12 @@ +.corona_job: + tags: + - batch + - corona + +.corona_build_gpu: + extends: [.build_gpu, .corona_job] + +.corona_test_gpu: + variables: + ci_test_regex: "." 
+ extends: [.test_gpu, .corona_job] diff --git a/tests/gitlab/gitlab-ci.yml b/tests/gitlab/gitlab-ci.yml new file mode 100644 index 00000000..5a7ee9bf --- /dev/null +++ b/tests/gitlab/gitlab-ci.yml @@ -0,0 +1,126 @@ +##################### +# Global Parameters # +##################### + +variables: + GIT_SUBMODULE_STRATEGY: recursive + LLNL_SLURM_SCHEDULER_PARAMETERS: "--nodes=1 -A asccasc -t 00:15:00" + +stages: + - build + - test + + +#################### +# Global Templates # +#################### + +# Build Stage Templates + +.build: + stage: build + artifacts: + when: always + paths: + - build + +.build_cpu: + before_script: + - module reset + - module load $ci_cmake + - module load $ci_cmp_mod + - |- + if [ "$ci_lang" == "cpp" ]; then + export CXX=$(which $ci_cxx_cmp) + export CC=$(which $ci_c_cmp) + if [ -z ${CXX} ]; then + echo "cxx compiler not set" + exit 1 + elif [ -z ${CC} ]; then + echo "c compiler not set" + exit 1 + fi + elif [ "$ci_lang" == "c" ]; then + export CC=$(which $ci_c_cmp) + if [ -z ${CC} ]; then + echo "c compiler not set" + exit 1 + fi + fi + script: + - mkdir build + - cd build + - cmake -DBUILD_TESTING=ON -DBUILD_UTILITIES=OFF -DZFP_WITH_CUDA=OFF ${ci_cmake_flags} .. + - make -j + extends: [.build] + +.build_gpu: + before_script: + - module reset + - module load opt + - module load $ci_cmake + - module load $ci_cmp_mod + script: + - mkdir build + - cd build + - cmake -DBUILD_TESTING=ON -DZFP_WITH_OPENMP=OFF -DBUILD_UTILITIES=OFF ${ci_cmake_flags} .. + - make -j + extends: [.build] + + +# Test Stage Templates + +.test: + stage: test + artifacts: + when: on_failure + paths: + - build/Testing + +.test_cpu: + script: + - cd build + - ctest -E "(Cuda|Hip)" -R "${ci_test_regex}" + extends: [.test] + +.test_gpu: + script: + - cd build + - ctest -R "${ci_test_regex}" + extends: [.test] + + +# Language Templates + +.cpp: + variables: + ci_lang: "cpp" + ci_cmake_flags: "-DBUILD_CFP=OFF -DBUILD_ZFPY=OFF -DBUILD_ZFORP=OFF" + +.c: + variables: + ci_lang: "c" + ci_cmake_flags: "-DBUILD_CFP=ON -DBUILD_ZFPY=OFF -DBUILD_ZFORP=OFF -DZFP_WITH_OPENMP=OFF" + +.cuda: + variables: + ci_lang: "cuda" + ci_cmake_flags: "-DZFP_WITH_CUDA=ON" + +#.hip: +# variables: +# ci_lang: "hip" +# ci_cmake_flags: "-DZFP_WITH_HIP=ON -DHIP_PATH=${ci_cmp_path} -DCMAKE_CXX_STANDARD=11 -DCMAKE_C_STANDARD=11 -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc" + + +############ +# Includes # +############ + +include: + - local: tests/gitlab/pascal-templates.yml + - local: tests/gitlab/pascal-jobs.yml + - local: tests/gitlab/quartz-templates.yml + - local: tests/gitlab/quartz-jobs.yml +# - local: tests/gitlab/corona-templates.yml +# - local: tests/gitlab/corona-jobs.yml diff --git a/tests/gitlab/pascal-jobs.yml b/tests/gitlab/pascal-jobs.yml new file mode 100644 index 00000000..3f363655 --- /dev/null +++ b/tests/gitlab/pascal-jobs.yml @@ -0,0 +1,16 @@ +############ +# CUDA GPU # +############ + +cuda-10.1.168_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_cmp_mod: "cuda/10.1.168" + extends: [.cuda, .pascal_build_gpu] + needs: [] + +cuda-10.1.168_test: + variables: + ci_test_regex: "Cuda" + extends: [.pascal_test_gpu] + needs: [cuda-10.1.168_build] diff --git a/tests/gitlab/pascal-templates.yml b/tests/gitlab/pascal-templates.yml new file mode 100644 index 00000000..30b26269 --- /dev/null +++ b/tests/gitlab/pascal-templates.yml @@ -0,0 +1,12 @@ +.pascal_job: + tags: + - batch + - pascal + +.pascal_build_gpu: + extends: [.build_gpu, .pascal_job] + +.pascal_test_gpu: + variables: + ci_test_regex: "." 
+ extends: [.test_gpu, .pascal_job] diff --git a/tests/gitlab/quartz-jobs.yml b/tests/gitlab/quartz-jobs.yml new file mode 100644 index 00000000..071728b3 --- /dev/null +++ b/tests/gitlab/quartz-jobs.yml @@ -0,0 +1,77 @@ +########### +# CXX CPU # +########### + +cpp_gnu-7.3.0_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_cxx_cmp: "g++" + ci_c_cmp: "gcc" + ci_cmp_mod: "gcc/7.3.0" + extends: [.cpp, .quartz_build_cpu] + needs: [] + +cpp_gnu-7.3.0_test: + extends: [.quartz_test_cpu] + needs: [cpp_gnu-7.3.0_build] + + +cpp_clang-10.0.0_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_cxx_cmp: "clang++" + ci_c_cmp: "clang" + ci_cmp_mod: "clang/10.0.0" + extends: [.cpp, .quartz_build_cpu] + needs: [] + +cpp_clang-10.0.0_test: + extends: [.quartz_test_cpu] + needs: [cpp_clang-10.0.0_build] + + +cpp_intel-19.0.4_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_cxx_cmp: "icpc" + ci_c_cmp: "icc" + ci_cmp_mod: "intel/19.0.4" + extends: [.cpp, .quartz_build_cpu] + needs: [] + +cpp_intel-19.0.4_test: + extends: [.quartz_test_cpu] + needs: [cpp_intel-19.0.4_build] + + +cpp_pgi-21.1_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_cxx_cmp: "pgc++" + ci_c_cmp: "pgcc" + ci_cmp_mod: "pgi/21.1" + extends: [.cpp, .quartz_build_cpu] + needs: [] + +cpp_pgi-21.1_test: + extends: [.quartz_test_cpu] + needs: [cpp_pgi-21.1_build] + + +######### +# C CPU # +######### + +c_gnu-7.3.0_build: + variables: + ci_cmake: "cmake/3.9.2" + ci_c_cmp: "gcc" + ci_cmp_mod: "gcc/7.3.0" + extends: [.c, .quartz_build_cpu] + needs: [] + +c_gnu-7.3.0_test: + variables: + ci_test_regex: "Cfp" + extends: [.quartz_test_cpu] + needs: [c_gnu-7.3.0_build] diff --git a/tests/gitlab/quartz-templates.yml b/tests/gitlab/quartz-templates.yml new file mode 100644 index 00000000..d4d18533 --- /dev/null +++ b/tests/gitlab/quartz-templates.yml @@ -0,0 +1,12 @@ +.quartz_job: + tags: + - batch + - quartz + +.quartz_build_cpu: + extends: [.build_cpu, .quartz_job] + +.quartz_test_cpu: + variables: + ci_test_regex: "." + extends: [.test_cpu, .quartz_job] diff --git a/tests/testzfp.cpp b/tests/testzfp.cpp index 7469358f..82b4074e 100644 --- a/tests/testzfp.cpp +++ b/tests/testzfp.cpp @@ -11,9 +11,10 @@ #include #include #include "zfp.h" -#include "zfparray1.h" -#include "zfparray2.h" -#include "zfparray3.h" +#include "zfp/array1.hpp" +#include "zfp/array2.hpp" +#include "zfp/array3.hpp" +#include "zfp/array4.hpp" enum ArraySize { Small = 0, // 2^12 = 4096 scalars (2^12 = (2^6)^2 = (2^4)^3 = (2^3)^4) @@ -48,7 +49,7 @@ refine1d(int* g, const int* f, size_t m) for (size_t x = 0; x < n; x++) { int s = 0; - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; s += weight[i] * f[xx]; } @@ -66,9 +67,9 @@ refine2d(int* g, const int* f, size_t m) for (size_t y = 0; y < n; y++) for (size_t x = 0; x < n; x++) { int s = 0; - for (int j = 0; j < 4; j++) { + for (size_t j = 0; j < 4; j++) { size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2; - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; s += weight[i] * weight[j] * f[xx + m * yy]; } @@ -88,11 +89,11 @@ refine3d(int* g, const int* f, size_t m) for (size_t y = 0; y < n; y++) for (size_t x = 0; x < n; x++) { int s = 0; - for (int k = 0; k < 4; k++) { + for (size_t k = 0; k < 4; k++) { size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2; - for (int j = 0; j < 4; j++) { + for (size_t j = 0; j < 4; j++) { size_t yy = y & 1u ? 
(y / 2 + j - 1 + m) % m : y / 2; - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; s += weight[i] * weight[j] * weight[k] * f[xx + m * (yy + m * zz)]; } @@ -114,13 +115,13 @@ refine4d(int* g, const int* f, size_t m) for (size_t y = 0; y < n; y++) for (size_t x = 0; x < n; x++) { int s = 0; - for (int l = 0; l < 4; l++) { + for (size_t l = 0; l < 4; l++) { size_t ww = w & 1u ? (w / 2 + l - 1 + m) % m : w / 2; - for (int k = 0; k < 4; k++) { + for (size_t k = 0; k < 4; k++) { size_t zz = z & 1u ? (z / 2 + k - 1 + m) % m : z / 2; - for (int j = 0; j < 4; j++) { + for (size_t j = 0; j < 4; j++) { size_t yy = y & 1u ? (y / 2 + j - 1 + m) % m : y / 2; - for (int i = 0; i < 4; i++) { + for (size_t i = 0; i < 4; i++) { size_t xx = x & 1u ? (x / 2 + i - 1 + m) % m : x / 2; s += weight[i] * weight[j] * weight[k] * weight[l] * f[xx + m * (yy + m * (zz + m * ww))]; } @@ -270,7 +271,7 @@ template inline void initialize(Scalar* p, uint dims, ArraySize array_size) { - size_t size = 1ul << ((array_size == Small ? 12 : 24) / dims); + size_t size = size_t(1) << ((array_size == Small ? 12 : 24) / dims); switch (dims) { default: @@ -317,7 +318,7 @@ test_rate(zfp_stream* stream, const zfp_field* input, double rate, Scalar tolera zfp_type type = zfp_field_type(input); // allocate memory for compressed data - rate = zfp_stream_set_rate(stream, rate, type, dims, 0); + rate = zfp_stream_set_rate(stream, rate, type, dims, zfp_false); size_t bufsize = zfp_stream_maximum_size(stream, input); uchar* buffer = new uchar[bufsize]; bitstream* s = stream_open(buffer, bufsize); @@ -630,6 +631,38 @@ update_array3(zfp::array3& a) a(0, 0, 0) = std::max(a(0, 0, 0), a(i, j, k)); } +// perform 4D differencing +template +inline void +update_array4(zfp::array4& a) +{ + for (uint l = 0; l < a.size_w(); l++) + for (uint k = 0; k < a.size_z(); k++) + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(i, j, k, l) -= a(i + 1, j, k, l); + for (uint l = 0; l < a.size_w(); l++) + for (uint k = 0; k < a.size_z(); k++) + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j, k, l) -= a(i, j + 1, k, l); + for (uint l = 0; l < a.size_w(); l++) + for (uint k = 0; k < a.size_z() - 1; k++) + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j, k, l) -= a(i, j, k + 1, l); + for (uint l = 0; l < a.size_w() - 1; l++) + for (uint k = 0; k < a.size_z(); k++) + for (uint j = 0; j < a.size_y(); j++) + for (uint i = 0; i < a.size_x(); i++) + a(i, j, k, l) -= a(i, j, k, l + 1); + for (uint l = 0; l < a.size_w() - 1; l++) + for (uint k = 0; k < a.size_z() - 1; k++) + for (uint j = 0; j < a.size_y() - 1; j++) + for (uint i = 0; i < a.size_x() - 1; i++) + a(0, 0, 0, 0) = std::max(a(0, 0, 0, 0), a(i, j, k, l)); +} + template inline void update_array(Array& a); @@ -657,6 +690,14 @@ template <> inline void update_array(zfp::array3& a) { update_array3(a); } +template <> +inline void +update_array(zfp::array4& a) { update_array4(a); } + +template <> +inline void +update_array(zfp::array4& a) { update_array4(a); } + // test random-accessible array primitive template inline uint @@ -716,9 +757,9 @@ test(uint dims, ArraySize array_size) Scalar* f = new Scalar[n]; // determine array size - uint nx, ny, nz ,nw; + uint nx, ny, nz, nw; zfp_field* field = zfp_field_alloc(); - zfp_field_set_type(field, zfp::codec::type); + zfp_field_set_type(field, zfp::internal::trait::type); 
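// annotation (illustrative, not part of the patch): zfp::internal::trait supersedes the
// pre-1.0 zfp::codec trait on the changed line above; both map the test's Scalar type
// (float or double) to the corresponding zfp_type enumerator before the field data
// pointer is attached below.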
zfp_field_set_pointer(field, f); switch (dims) { case 1: @@ -749,7 +790,7 @@ test(uint dims, ArraySize array_size) std::cout << "testing " << dims << "D array of " << (t == 0 ? "floats" : "doubles") << std::endl; // test data integrity - uint32 checksum[2][2][4] = { + uint32 checksum[2][2][4] = { // [size][type][dims] // small {{ 0x54174c44u, 0x86609589u, 0xfc0a6a76u, 0xa3481e00u }, { 0x7d257bb6u, 0x294bb210u, 0x68614d26u, 0xf6bd3a21u }}, @@ -767,7 +808,7 @@ test(uint dims, ArraySize array_size) // test fixed rate for (uint rate = 2u >> t, i = 0; rate <= 32 * (t + 1); rate *= 4, i++) { // expected max errors - double emax[2][2][4][4] = { + double emax[2][2][4][4] = { // [size][type][dims][rate] // small { { @@ -808,7 +849,7 @@ test(uint dims, ArraySize array_size) // test fixed precision for (uint prec = 4u << t, i = 0; i < 3; prec *= 2, i++) { // expected compressed sizes - size_t bytes[2][2][4][3] = { + size_t bytes[2][2][4][3] = { // [size][type][dims][prec] // small { { @@ -847,7 +888,7 @@ test(uint dims, ArraySize array_size) for (uint i = 0; i < 3; i++) { Scalar tol[] = { Scalar(1e-3), 2 * std::numeric_limits::epsilon(), 0 }; // expected compressed sizes - size_t bytes[2][2][4][3] = { + size_t bytes[2][2][4][3] = { // [size][type][dims][tol] // small { { @@ -885,7 +926,7 @@ test(uint dims, ArraySize array_size) // test reversible { // expected compressed sizes - size_t bytes[2][2][4] = { + size_t bytes[2][2][4] = { // [size][type][dims] // small { { @@ -921,28 +962,28 @@ test(uint dims, ArraySize array_size) } // test compressed array support - double emax[2][2][3] = { + double emax[2][2][4] = { // [size][type][dims] (construct test) // small { - {4.578e-05, 7.630e-06, 3.148e-05}, - {1.832e-04, 8.584e-06, 3.338e-05}, + {4.578e-05, 7.630e-06, 3.148e-05, 3.598e-03}, + {1.832e-04, 8.584e-06, 3.338e-05, 3.312e-03}, }, // large { - {0.000e+00, 0.000e+00, 0.000e+00}, - {2.289e-05, 0.000e+00, 0.000e+00}, + {0.000e+00, 0.000e+00, 0.000e+00, 1.193e-07}, + {2.289e-05, 0.000e+00, 0.000e+00, 8.801e-08}, } }; - double dfmax[2][2][3] = { + double dfmax[2][2][4] = { // [size][type][dims] (update test) // small { - {2.155e-02, 3.755e-01, 1.846e+00}, - {2.155e-02, 3.755e-01, 1.846e+00}, + {2.155e-02, 3.755e-01, 1.846e+00, 4.843e+01}, + {2.155e-02, 3.755e-01, 1.846e+00, 4.844e+01}, }, // large { - {2.441e-04, 4.883e-04, 1.221e-03}, - {2.670e-04, 4.883e-04, 1.221e-03}, + {2.441e-04, 4.883e-04, 1.222e-03, 2.567e-02}, + {2.670e-04, 4.883e-04, 1.222e-03, 2.567e-02}, } }; double rate = 16; @@ -962,7 +1003,10 @@ test(uint dims, ArraySize array_size) failures += test_array(a, f, n, static_cast(emax[array_size][t][dims - 1]), static_cast(dfmax[array_size][t][dims - 1])); } break; - case 4: // 4D arrays not yet supported + case 4: { + zfp::array4 a(nx, ny, nz, nw, rate, f); + failures += test_array(a, f, n, static_cast(emax[array_size][t][dims - 1]), static_cast(dfmax[array_size][t][dims - 1])); + } break; } @@ -979,6 +1023,7 @@ inline uint common_tests() { uint failures = 0; + uint warnings = 0; // test library version if (zfp_codec_version != ZFP_CODEC || zfp_library_version != ZFP_VERSION) { std::cout << "library header and binary version mismatch" << std::endl; @@ -1021,6 +1066,21 @@ common_tests() std::cout << "regression testing requires BIT_STREAM_WORD_TYPE=uint64" << std::endl; failures++; } + // warn if non-default compiler options are used +#if ZFP_ROUNDING_MODE != 0 + std::cout << "warning: selected ZFP_ROUNDING_MODE may break tests" << std::endl; + warnings++; +#ifdef ZFP_WITH_TIGHT_ERROR + 
std::cout << "warning: ZFP_WITH_TIGHT_ERROR option may break tests" << std::endl; + warnings++; +#endif +#endif +#ifdef ZFP_WITH_DAZ + std::cout << "warning: ZFP_WITH_DAZ option may break tests" << std::endl; + warnings++; +#endif + if (failures || warnings) + std::cout << std::endl; return failures; } diff --git a/travis.sh b/travis.sh deleted file mode 100755 index e73383cd..00000000 --- a/travis.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env sh -set -e - -# pass additional args in $1 (starting with whitespace character) -run_all () { - run_all_cmd="ctest -V -C Debug -DC_STANDARD=${C_STANDARD:-99} -DCXX_STANDARD=${CXX_STANDARD:-98} -S \"$TRAVIS_BUILD_DIR/cmake/travis.cmake\"" - eval "${run_all_cmd}$1" -} - -mkdir build -cd build - -# technically, flags are passed on to cmake/* and actually set there -BUILD_FLAGS="" - -if [ -n "${COVERAGE}" ]; then - # build (linux) - - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_UTILITIES=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_EXAMPLES=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CFP=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFPY=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFORP=ON" - BUILD_FLAGS="$BUILD_FLAGS -DZFP_WITH_ALIGNED_ALLOC=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CUDA=OFF" - BUILD_FLAGS="$BUILD_FLAGS -DWITH_COVERAGE=ON" - - run_all "$BUILD_FLAGS" -else - # build/test without OpenMP, with CFP (and custom namespace), with zfPy, with Fortran (linux only) - if [[ "$OSTYPE" == "darwin"* ]]; then - BUILD_ZFORP=OFF - else - BUILD_ZFORP=ON - fi - - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_UTILITIES=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_EXAMPLES=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CFP=ON" - BUILD_FLAGS="$BUILD_FLAGS -DCFP_NAMESPACE=cfp2" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFPY=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_ZFORP=$BUILD_ZFORP" - BUILD_FLAGS="$BUILD_FLAGS -DZFP_WITH_ALIGNED_ALLOC=ON" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=OFF" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_CUDA=OFF" - run_all "$BUILD_FLAGS" - - rm -rf ./* ; - - # if OpenMP available, start a 2nd build with it - if cmake ../tests/ci-utils/ ; then - rm -rf ./* ; - - # build/test with OpenMP - BUILD_FLAGS="" - BUILD_FLAGS="$BUILD_FLAGS -DBUILD_OPENMP=ON" - run_all "$BUILD_FLAGS" - fi -fi diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index a960d2c9..1aa7a930 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -1,6 +1,16 @@ add_executable(zfpcmd zfp.c) -set_property(TARGET zfpcmd PROPERTY OUTPUT_NAME zfp) + +# protect against LNK1114: cannot overwrite the original file 'lib/Release/zfp.lib'; error code 32; +# rationale: linker can't handle the case of an executable file having the same name as a library file +if(NOT MSVC) + set_property(TARGET zfpcmd PROPERTY OUTPUT_NAME zfp) +endif() target_link_libraries(zfpcmd zfp) if(HAVE_LIBM_MATH) target_link_libraries(zfpcmd m) endif() + +if(BUILD_UTILITIES) + install(TARGETS zfpcmd + DESTINATION "${CMAKE_INSTALL_BINDIR}") +endif() diff --git a/utils/Makefile b/utils/Makefile index 50a40ce9..dc7ef3e9 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -1,12 +1,14 @@ include ../Config TARGET = ../bin/zfp +INCS = -I../include +LIBS = -L../lib -lzfp $(LDFLAGS) -lm all: $(TARGET) $(TARGET): zfp.c ../lib/$(LIBZFP) mkdir -p ../bin - $(CC) $(CFLAGS) zfp.c -L../lib -lzfp -lm -o $(TARGET) + $(CC) $(CFLAGS) $(INCS) zfp.c $(LIBS) -o $(TARGET) clean: rm -f $(TARGET) fields.o diff --git a/utils/zfp.c b/utils/zfp.c index 97a621f5..3520f743 100644 --- a/utils/zfp.c +++ b/utils/zfp.c @@ -5,7 +5,7 @@ #include 
#include #include "zfp.h" -#include "zfp/macros.h" +#include "zfp/internal/zfp/macros.h" /* File I/O is done using the following combinations of i, o, s, and z: @@ -137,10 +137,10 @@ int main(int argc, char* argv[]) zfp_type type = zfp_type_none; size_t typesize = 0; uint dims = 0; - uint nx = 0; - uint ny = 0; - uint nz = 0; - uint nw = 0; + size_t nx = 0; + size_t ny = 0; + size_t nz = 0; + size_t nw = 0; size_t count = 0; double rate = 0; uint precision = 0; @@ -149,9 +149,9 @@ int main(int argc, char* argv[]) uint maxbits = ZFP_MAX_BITS; uint maxprec = ZFP_MAX_PREC; int minexp = ZFP_MIN_EXP; - int header = 0; - int quiet = 0; - int stats = 0; + zfp_bool header = zfp_false; + zfp_bool quiet = zfp_false; + zfp_bool stats = zfp_false; char* inpath = 0; char* zfppath = 0; char* outpath = 0; @@ -181,31 +181,31 @@ int main(int argc, char* argv[]) usage(); switch (argv[i][1]) { case '1': - if (++i == argc || sscanf(argv[i], "%u", &nx) != 1) + if (++i == argc || sscanf(argv[i], "%zu", &nx) != 1) usage(); ny = nz = nw = 1; dims = 1; break; case '2': - if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || - ++i == argc || sscanf(argv[i], "%u", &ny) != 1) + if (++i == argc || sscanf(argv[i], "%zu", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &ny) != 1) usage(); nz = nw = 1; dims = 2; break; case '3': - if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || - ++i == argc || sscanf(argv[i], "%u", &ny) != 1 || - ++i == argc || sscanf(argv[i], "%u", &nz) != 1) + if (++i == argc || sscanf(argv[i], "%zu", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &ny) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &nz) != 1) usage(); nw = 1; dims = 3; break; case '4': - if (++i == argc || sscanf(argv[i], "%u", &nx) != 1 || - ++i == argc || sscanf(argv[i], "%u", &ny) != 1 || - ++i == argc || sscanf(argv[i], "%u", &nz) != 1 || - ++i == argc || sscanf(argv[i], "%u", &nw) != 1) + if (++i == argc || sscanf(argv[i], "%zu", &nx) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &ny) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &nz) != 1 || + ++i == argc || sscanf(argv[i], "%zu", &nw) != 1) usage(); dims = 4; break; @@ -229,7 +229,7 @@ int main(int argc, char* argv[]) type = zfp_type_float; break; case 'h': - header = 1; + header = zfp_true; break; case 'i': if (++i == argc) @@ -247,7 +247,7 @@ int main(int argc, char* argv[]) mode = 'p'; break; case 'q': - quiet = 1; + quiet = zfp_true; break; case 'r': if (++i == argc || sscanf(argv[i], "%lf", &rate) != 1) @@ -258,7 +258,7 @@ int main(int argc, char* argv[]) mode = 'R'; break; case 's': - stats = 1; + stats = zfp_true; break; case 't': if (++i == argc) @@ -307,7 +307,7 @@ int main(int argc, char* argv[]) } typesize = zfp_type_size(type); - count = (size_t)nx * (size_t)ny * (size_t)nz * (size_t)nw; + count = nx * ny * nz * nw; /* make sure one of the array dimensions is not zero */ if (!count && dims) { @@ -456,7 +456,7 @@ int main(int argc, char* argv[]) zfp_stream_set_precision(zfp, precision); break; case 'r': - zfp_stream_set_rate(zfp, rate, type, dims, 0); + zfp_stream_set_rate(zfp, rate, type, dims, zfp_false); break; case 'c': if (!maxbits) @@ -556,22 +556,16 @@ int main(int argc, char* argv[]) return EXIT_FAILURE; } type = field->type; - switch (type) { - case zfp_type_float: - typesize = sizeof(float); - break; - case zfp_type_double: - typesize = sizeof(double); - break; - default: - fprintf(stderr, "unsupported type\n"); - return EXIT_FAILURE; + typesize = zfp_type_size(type); + if (!typesize) { + fprintf(stderr, "unsupported type\n"); + return 
EXIT_FAILURE;
     }
     nx = MAX(field->nx, 1u);
     ny = MAX(field->ny, 1u);
     nz = MAX(field->nz, 1u);
     nw = MAX(field->nw, 1u);
-    count = (size_t)nx * (size_t)ny * (size_t)nz * (size_t)nw;
+    count = nx * ny * nz * nw;
   }
 
   /* allocate memory for decompressed data */
@@ -616,7 +610,7 @@ int main(int argc, char* argv[])
   /* print compression and error statistics */
   if (!quiet) {
     const char* type_name[] = { "int32", "int64", "float", "double" };
-    fprintf(stderr, "type=%s nx=%u ny=%u nz=%u nw=%u", type_name[type - zfp_type_int32], nx, ny, nz, nw);
+    fprintf(stderr, "type=%s nx=%zu ny=%zu nz=%zu nw=%zu", type_name[type - zfp_type_int32], nx, ny, nz, nw);
     fprintf(stderr, " raw=%lu zfp=%lu ratio=%.3g rate=%.4g", (unsigned long)rawsize, (unsigned long)zfpsize, (double)rawsize / zfpsize, CHAR_BIT * (double)zfpsize / count);
     if (stats)
       print_error(fi, fo, type, count);
diff --git a/zfp-config-version.cmake.in b/zfp-config-version.cmake.in
index 4a77db0a..44932702 100644
--- a/zfp-config-version.cmake.in
+++ b/zfp-config-version.cmake.in
@@ -1,6 +1,8 @@
 set(PACKAGE_VERSION_MAJOR @ZFP_VERSION_MAJOR@)
 set(PACKAGE_VERSION_MINOR @ZFP_VERSION_MINOR@)
 set(PACKAGE_VERSION_PATCH @ZFP_VERSION_PATCH@)
+set(PACKAGE_VERSION_TWEAK @ZFP_VERSION_TWEAK@)
+
 set(PACKAGE_VERSION @ZFP_VERSION@)
 
 # Check whether the requested PACKAGE_FIND_VERSION is compatible
diff --git a/zfp-config.cmake.in b/zfp-config.cmake.in
index 642f6178..87ceb5d2 100644
--- a/zfp-config.cmake.in
+++ b/zfp-config.cmake.in
@@ -3,6 +3,8 @@
 # It defines the following variables
 #   ZFP_INCLUDE_DIRS - include directories for zfp
 #   ZFP_LIBRARIES - libraries to link against
+#   ZFP_WITH_OPENMP - indicates if the zfp library has been built with OpenMP support
+#   ZFP_WITH_CUDA - indicates if the zfp library has been built with CUDA support
 #
 # And the following imported targets:
 #   zfp::zfp
@@ -19,6 +21,11 @@ if(NOT TARGET zfp::zfp)
 endif()
 
 set(ZFP_LIBRARIES zfp::zfp)
-set(ZFP_INCLUDE_DIRS
-  $
-)
+get_target_property(ZFP_INCLUDE_DIRS zfp::zfp INTERFACE_INCLUDE_DIRECTORIES)
+
+set(ZFP_WITH_OPENMP @ZFP_WITH_OPENMP@)
+if(ZFP_WITH_OPENMP)
+  find_package(OpenMP REQUIRED COMPONENTS C)
+endif()
+
+set(ZFP_WITH_CUDA @ZFP_WITH_CUDA@)
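# Aside (not part of the patch): a minimal sketch of how a downstream CMake
# project might consume the package config above.  The project name, source
# file, and minimum CMake version are placeholder assumptions.
cmake_minimum_required(VERSION 3.9)
project(myapp C)

find_package(zfp 1.0.0 CONFIG REQUIRED)

add_executable(myapp main.c)
target_link_libraries(myapp zfp::zfp)

# the config file now also reports how the installed library was built
if(ZFP_WITH_OPENMP)
  message(STATUS "zfp was built with OpenMP support")
endif()
if(ZFP_WITH_CUDA)
  message(STATUS "zfp was built with CUDA support")
endif()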