diff --git a/.travis.yml b/.travis.yml index 78c638d8b..6747fe30b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,12 @@ language: cpp matrix: include: +# OSX - os: osx compiler: clang env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1 + +# Test code style - os: linux compiler: clang-3.8 env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_CHECK_STYLE=1 OPJ_CI_SKIP_TESTS=1 @@ -16,12 +19,31 @@ matrix: packages: - clang-3.8 - flip + +# Performance test with GCC - os: linux compiler: g++ env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1 OPJ_CI_PERF_TESTS=1 + +# Test compilation with AVX2 + - os: linux + compiler: clang-3.8 + # skip tests since Travis doesn't have AVX2 compatible machines + env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_INSTRUCTION_SETS="-mavx2" OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_SKIP_TESTS=1 + addons: + apt: + sources: + - llvm-toolchain-precise-3.8 + - ubuntu-toolchain-r-test + packages: + - clang-3.8 + +# Test multi-threading - os: linux compiler: g++ env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_NUM_THREADS=2 + +# Test 32-bit compilation - os: linux compiler: g++ env: OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release @@ -30,6 +52,8 @@ matrix: packages: - gcc-multilib - g++-multilib + +# Profile code (gcc -pg) - os: linux compiler: g++ env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_PROFILE=1 @@ -37,9 +61,13 @@ matrix: apt: packages: - valgrind + +# Test under ASAN - os: linux compiler: clang env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_ASAN=1 + +# Test with CLang 3.8 - os: linux compiler: clang-3.8 env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_PERF_TESTS=1 @@ -50,6 +78,8 @@ matrix: - ubuntu-toolchain-r-test packages: - clang-3.8 + +# Test with mingw 32 bit - os: linux compiler: x86_64-w64-mingw32-g++ env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release @@ -63,6 +93,8 @@ matrix: - g++-mingw-w64-i686 - gcc-multilib - g++-multilib + +# Test with mingw 64 bit - os: linux compiler: x86_64-w64-mingw32-g++ env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release @@ -74,6 +106,8 @@ matrix: - gcc-mingw-w64-x86-64 - gcc-mingw-w64 - g++-mingw-w64-x86-64 + +# Test with gcc 4.8 - os: linux compiler: g++-4.8 env: OPJ_CI_CC=gcc-4.8 OPJ_CI_CXX=g++-4.8 OPJ_CI_ABI_CHECK=1 diff --git a/CMakeLists.txt b/CMakeLists.txt index c56f4814c..76aa95eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,6 +253,7 @@ if(BUILD_JPIP_SERVER) endif() add_subdirectory(src/lib) option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF) +option(BUILD_BENCH_DWT "Build bench_dwt utility (development benchmark)" OFF) #----------------------------------------------------------------------------- # Build Applications diff --git a/src/lib/openjp2/CMakeLists.txt b/src/lib/openjp2/CMakeLists.txt index ad77c6e33..0e8e9a9cd 100644 --- a/src/lib/openjp2/CMakeLists.txt +++ b/src/lib/openjp2/CMakeLists.txt @@ -183,3 +183,13 @@ endif(OPJ_USE_THREAD AND NOT Threads_FOUND) if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT}) endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + +if(BUILD_BENCH_DWT) + add_executable(bench_dwt bench_dwt.c dwt.c opj_malloc.c thread.c) + if(UNIX) + target_link_libraries(bench_dwt m) + endif() + if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + target_link_libraries(bench_dwt ${CMAKE_THREAD_LIBS_INIT}) + endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) +endif(BUILD_BENCH_DWT) diff --git a/src/lib/openjp2/bench_dwt.c b/src/lib/openjp2/bench_dwt.c new file mode 100644 index 000000000..5b07365ca --- /dev/null +++ b/src/lib/openjp2/bench_dwt.c @@ -0,0 +1,241 @@ +/* + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2017, IntoPix SA + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opj_includes.h" + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif /* _WIN32 */ + +OPJ_INT32 getValue(OPJ_UINT32 i) +{ + return ((OPJ_INT32)i % 511) - 256; +} + +void init_tilec(opj_tcd_tilecomp_t * l_tilec, + OPJ_INT32 x0, + OPJ_INT32 y0, + OPJ_INT32 x1, + OPJ_INT32 y1, + OPJ_UINT32 numresolutions) +{ + opj_tcd_resolution_t* l_res; + OPJ_UINT32 resno, l_level_no; + size_t i, nValues; + + memset(l_tilec, 0, sizeof(*l_tilec)); + l_tilec->x0 = x0; + l_tilec->y0 = y0; + l_tilec->x1 = x1; + l_tilec->y1 = y1; + nValues = (size_t)(l_tilec->x1 - l_tilec->x0) * + (size_t)(l_tilec->y1 - l_tilec->y0); + l_tilec->data = opj_malloc(sizeof(OPJ_INT32) * nValues); + for (i = 0; i < nValues; i++) { + l_tilec->data[i] = getValue(i); + } + l_tilec->numresolutions = numresolutions; + l_tilec->resolutions = opj_calloc(l_tilec->numresolutions, + sizeof(opj_tcd_resolution_t)); + + l_level_no = l_tilec->numresolutions; + l_res = l_tilec->resolutions; + + /* Adapted from opj_tcd_init_tile() */ + for (resno = 0; resno < l_tilec->numresolutions; ++resno) { + + --l_level_no; + + /* border for each resolution level (global) */ + l_res->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no); + l_res->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no); + l_res->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no); + l_res->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no); + + ++l_res; + } +} + +void free_tilec(opj_tcd_tilecomp_t * l_tilec) +{ + opj_free(l_tilec->data); + opj_free(l_tilec->resolutions); +} + +void usage(void) +{ + printf( + "bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n"); + printf( + " [-offset x y] [-num_threads val]\n"); + exit(1); +} + + +OPJ_FLOAT64 opj_clock(void) +{ +#ifdef _WIN32 + /* _WIN32: use QueryPerformance (very accurate) */ + LARGE_INTEGER freq, t ; + /* freq is the clock speed of the CPU */ + QueryPerformanceFrequency(&freq) ; + /* cout << "freq = " << ((double) freq.QuadPart) << endl; */ + /* t is the high resolution performance counter (see MSDN) */ + QueryPerformanceCounter(& t) ; + return freq.QuadPart ? (t.QuadPart / (OPJ_FLOAT64) freq.QuadPart) : 0 ; +#else + /* Unix or Linux: use resource usage */ + struct rusage t; + OPJ_FLOAT64 procTime; + /* (1) Get the rusage data structure at this moment (man getrusage) */ + getrusage(0, &t); + /* (2) What is the elapsed time ? - CPU time = User time + System time */ + /* (2a) Get the seconds */ + procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec); + /* (2b) More precisely! Get the microseconds part ! */ + return (procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) * + 1e-6) ; +#endif +} + +int main(int argc, char** argv) +{ + int num_threads = 0; + opj_tcd_tilecomp_t tilec; + opj_thread_pool_t* tp; + OPJ_INT32 i, j, k; + OPJ_BOOL display = OPJ_FALSE; + OPJ_BOOL check = OPJ_FALSE; + OPJ_INT32 size = 16384 - 1; + OPJ_FLOAT64 start, stop; + OPJ_UINT32 offset_x = (size + 1) / 2 - 1; + OPJ_UINT32 offset_y = (size + 1) / 2 - 1; + OPJ_UINT32 num_resolutions = 6; + + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-display") == 0) { + display = OPJ_TRUE; + check = OPJ_TRUE; + } else if (strcmp(argv[i], "-check") == 0) { + check = OPJ_TRUE; + } else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) { + size = atoi(argv[i + 1]); + i ++; + } else if (strcmp(argv[i], "-num_threads") == 0 && i + 1 < argc) { + num_threads = atoi(argv[i + 1]); + i ++; + } else if (strcmp(argv[i], "-num_resolutions") == 0 && i + 1 < argc) { + num_resolutions = atoi(argv[i + 1]); + if (num_resolutions == 0 || num_resolutions > 32) { + fprintf(stderr, + "Invalid value for num_resolutions. Should be >= 1 and <= 32\n"); + exit(1); + } + i ++; + } else if (strcmp(argv[i], "-offset") == 0 && i + 2 < argc) { + offset_x = atoi(argv[i + 1]); + offset_y = atoi(argv[i + 2]); + i += 2; + } else { + usage(); + } + } + + tp = opj_thread_pool_create(num_threads); + + init_tilec(&tilec, offset_x, offset_y, offset_x + size, offset_y + size, + num_resolutions); + + if (display) { + printf("Before\n"); + k = 0; + for (j = 0; j < tilec.y1 - tilec.y0; j++) { + for (i = 0; i < tilec.x1 - tilec.x0; i++) { + printf("%d ", tilec.data[k]); + k ++; + } + printf("\n"); + } + } + + start = opj_clock(); + opj_dwt_decode(tp, &tilec, tilec.numresolutions); + stop = opj_clock(); + printf("time for dwt_decode: %.03f s\n", stop - start); + + if (display || check) { + if (display) { + printf("After IDWT\n"); + k = 0; + for (j = 0; j < tilec.y1 - tilec.y0; j++) { + for (i = 0; i < tilec.x1 - tilec.x0; i++) { + printf("%d ", tilec.data[k]); + k ++; + } + printf("\n"); + } + } + + opj_dwt_encode(&tilec); + if (display) { + printf("After FDWT\n"); + k = 0; + for (j = 0; j < tilec.y1 - tilec.y0; j++) { + for (i = 0; i < tilec.x1 - tilec.x0; i++) { + printf("%d ", tilec.data[k]); + k ++; + } + printf("\n"); + } + } + + if (check) { + size_t idx; + size_t nValues = (size_t)(tilec.x1 - tilec.x0) * + (size_t)(tilec.y1 - tilec.y0); + for (idx = 0; i < nValues; i++) { + if (tilec.data[idx] != getValue(idx)) { + printf("Difference found at idx = %u\n", (OPJ_UINT32)idx); + exit(1); + } + } + } + } + + free_tilec(&tilec); + + opj_thread_pool_destroy(tp); + return 0; +} diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index b5f570429..4a5ba609c 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -13,6 +13,7 @@ * Copyright (c) 2005, Herve Drolon, FreeImage Team * Copyright (c) 2007, Jonathan Ballard * Copyright (c) 2007, Callum Lerwick + * Copyright (c) 2017, IntoPIX SA * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,11 +38,27 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include + +#define OPJ_SKIP_POISON +#include "opj_includes.h" + #ifdef __SSE__ #include #endif +#ifdef __SSE2__ +#include +#endif +#ifdef __SSSE3__ +#include +#endif +#ifdef __AVX2__ +#include +#endif -#include "opj_includes.h" +#if defined(__GNUC__) +#pragma GCC poison malloc calloc realloc free +#endif /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */ /*@{*/ @@ -49,6 +66,17 @@ #define OPJ_WS(i) v->mem[(i)*2] #define OPJ_WD(i) v->mem[(1+(i)*2)] +#ifdef __AVX2__ +/** Number of int32 values in a AVX2 register */ +#define VREG_INT_COUNT 8 +#else +/** Number of int32 values in a SSE2 register */ +#define VREG_INT_COUNT 4 +#endif + +/** Number of columns that we can process in parallel in the vertical pass */ +#define PARALLEL_COLS_53 (2*VREG_INT_COUNT) + /** @name Local data structures */ /*@{*/ @@ -83,7 +111,7 @@ static const OPJ_FLOAT32 opj_c13318 = 1.625732422f; /** Virtual function type for wavelet transform in 1-D */ -typedef void (*DWT1DFN)(opj_dwt_t* v); +typedef void (*DWT1DFN)(const opj_dwt_t* v); /** @name Local static functions */ /*@{*/ @@ -99,25 +127,11 @@ Forward lazy transform (vertical) static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas); /** -Inverse lazy transform (horizontal) -*/ -static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a); -/** -Inverse lazy transform (vertical) -*/ -static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x); -/** Forward 5-3 wavelet transform in 1-D */ static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); /** -Inverse 5-3 wavelet transform in 1-D -*/ -static void opj_dwt_decode_1(opj_dwt_t *v); -static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, - OPJ_INT32 cas); -/** Forward 9-7 wavelet transform in 1-D */ static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, @@ -131,7 +145,7 @@ static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, Inverse wavelet transform in 2-D. */ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, - opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn); + opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i); static OPJ_BOOL opj_dwt_encode_procedure(opj_tcd_tilecomp_t * tilec, void (*p_function)(OPJ_INT32 *, OPJ_INT32, OPJ_INT32, OPJ_INT32)); @@ -255,10 +269,11 @@ static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ } +#ifdef STANDARD_SLOW_VERSION /* */ /* Inverse lazy transform (horizontal). */ /* */ -static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a) +static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) { OPJ_INT32 *ai = a; OPJ_INT32 *bi = h->mem + h->cas; @@ -279,7 +294,7 @@ static void opj_dwt_interleave_h(opj_dwt_t* h, OPJ_INT32 *a) /* */ /* Inverse lazy transform (vertical). */ /* */ -static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) +static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) { OPJ_INT32 *ai = a; OPJ_INT32 *bi = v->mem + v->cas; @@ -299,6 +314,7 @@ static void opj_dwt_interleave_v(opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) } } +#endif /* STANDARD_SLOW_VERSION */ /* */ /* Forward 5-3 wavelet transform in 1-D. */ @@ -331,6 +347,7 @@ static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, } } +#ifdef STANDARD_SLOW_VERSION /* */ /* Inverse 5-3 wavelet transform in 1-D. */ /* */ @@ -362,14 +379,634 @@ static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, } } +static void opj_dwt_decode_1(const opj_dwt_t *v) +{ + opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas); +} + +#endif /* STANDARD_SLOW_VERSION */ + +#if !defined(STANDARD_SLOW_VERSION) +static void opj_idwt53_h_cas0(OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp) +{ + OPJ_INT32 i, j; + const OPJ_INT32* in_even = &tiledp[0]; + const OPJ_INT32* in_odd = &tiledp[sn]; + +#ifdef TWO_PASS_VERSION + /* For documentation purpose: performs lifting in two iterations, */ + /* but withtmp explicit interleaving */ + + assert(len > 1); + + /* Even */ + tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1); + for (i = 2, j = 0; i <= len - 2; i += 2, j++) { + tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2); + } + if (len & 1) { /* if len is odd */ + tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1); + } + + /* Odd */ + for (i = 1, j = 0; i < len - 1; i += 2, j++) { + tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1); + } + if (!(len & 1)) { /* if len is even */ + tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2]; + } +#else + OPJ_INT32 d1c, d1n, s1n, s0c, s0n; + + assert(len > 1); + + /* Improved version of the TWO_PASS_VERSION: */ + /* Performs lifting in one single iteration. Saves memory */ + /* accesses and explicit interleaving. */ + s1n = in_even[0]; + d1n = in_odd[0]; + s0n = s1n - ((d1n + 1) >> 1); + + for (i = 0, j = 1; i < (len - 3); i += 2, j++) { + d1c = d1n; + s0c = s0n; + + s1n = in_even[j]; + d1n = in_odd[j]; + + s0n = s1n - ((d1c + d1n + 2) >> 2); + + tmp[i ] = s0c; + tmp[i + 1] = d1c + ((s0c + s0n) >> 1); + } + + tmp[i] = s0n; + + if (len & 1) { + tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1); + tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); + } else { + tmp[len - 1] = d1n + s0n; + } +#endif + memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); +} + +static void opj_idwt53_h_cas1(OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp) +{ + OPJ_INT32 i, j; + const OPJ_INT32* in_even = &tiledp[sn]; + const OPJ_INT32* in_odd = &tiledp[0]; + +#ifdef TWO_PASS_VERSION + /* For documentation purpose: performs lifting in two iterations, */ + /* but withtmp explicit interleaving */ + + assert(len > 2); + + /* Odd */ + for (i = 1, j = 0; i < len - 1; i += 2, j++) { + tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2); + } + if (!(len & 1)) { + tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1); + } + + /* Even */ + tmp[0] = in_even[0] + tmp[1]; + for (i = 2, j = 1; i < len - 1; i += 2, j++) { + tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1); + } + if (len & 1) { + tmp[len - 1] = in_even[len / 2] + tmp[len - 2]; + } +#else + OPJ_INT32 s1, s2, dc, dn; + + assert(len > 2); + + /* Improved version of the TWO_PASS_VERSION: */ + /* Performs lifting in one single iteration. Saves memory */ + /* accesses and explicit interleaving. */ + + s1 = in_even[1]; + dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); + tmp[0] = in_even[0] + dc; + + for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { + + s2 = in_even[j + 1]; + + dn = in_odd[j] - ((s1 + s2 + 2) >> 2); + tmp[i ] = dc; + tmp[i + 1] = s1 + ((dn + dc) >> 1); + + dc = dn; + s1 = s2; + } + + tmp[i] = dc; + + if (!(len & 1)) { + dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1); + tmp[len - 2] = s1 + ((dn + dc) >> 1); + tmp[len - 1] = dn; + } else { + tmp[len - 1] = s1 + dc; + } +#endif + memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); +} + + +#endif /* !defined(STANDARD_SLOW_VERSION) */ + /* */ -/* Inverse 5-3 wavelet transform in 1-D. */ +/* Inverse 5-3 wavelet transform in 1-D for one row. */ /* */ -static void opj_dwt_decode_1(opj_dwt_t *v) +/* Performs interleave, inverse wavelet transform and copy back to buffer */ +static void opj_idwt53_h(const opj_dwt_t *dwt, + OPJ_INT32* tiledp) { - opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas); +#ifdef STANDARD_SLOW_VERSION + /* For documentation purpose */ + opj_dwt_interleave_h(dwt, tiledp); + opj_dwt_decode_1(dwt); + memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32)); +#else + const OPJ_INT32 sn = dwt->sn; + const OPJ_INT32 len = sn + dwt->dn; + if (dwt->cas == 0) { /* Left-most sample is on even coordinate */ + if (len > 1) { + opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp); + } else { + /* Unmodified value */ + } + } else { /* Left-most sample is on odd coordinate */ + if (len == 1) { + tiledp[0] /= 2; + } else if (len == 2) { + OPJ_INT32* out = dwt->mem; + const OPJ_INT32* in_even = &tiledp[sn]; + const OPJ_INT32* in_odd = &tiledp[0]; + out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); + out[0] = in_even[0] + out[1]; + memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32)); + } else if (len > 2) { + opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp); + } + } +#endif +} + +#if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) + +/* Conveniency macros to improve the readabilty of the formulas */ +#if __AVX2__ +#define VREG __m256i +#define LOAD_CST(x) _mm256_set1_epi32(x) +#define LOAD(x) _mm256_load_si256((const VREG*)(x)) +#define LOADU(x) _mm256_loadu_si256((const VREG*)(x)) +#define STORE(x,y) _mm256_store_si256((VREG*)(x),(y)) +#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y)) +#define ADD(x,y) _mm256_add_epi32((x),(y)) +#define SUB(x,y) _mm256_sub_epi32((x),(y)) +#define SAR(x,y) _mm256_srai_epi32((x),(y)) +#else +#define VREG __m128i +#define LOAD_CST(x) _mm_set1_epi32(x) +#define LOAD(x) _mm_load_si128((const VREG*)(x)) +#define LOADU(x) _mm_loadu_si128((const VREG*)(x)) +#define STORE(x,y) _mm_store_si128((VREG*)(x),(y)) +#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y)) +#define ADD(x,y) _mm_add_epi32((x),(y)) +#define SUB(x,y) _mm_sub_epi32((x),(y)) +#define SAR(x,y) _mm_srai_epi32((x),(y)) +#endif +#define ADD3(x,y,z) ADD(ADD(x,y),z) + +static +void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col, + const OPJ_INT32* tmp, + OPJ_INT32 len, + OPJ_INT32 stride) +{ + OPJ_INT32 i; + for (i = 0; i < len; ++i) { + /* A memcpy(&tiledp_col[i * stride + 0], + &tmp[PARALLEL_COLS_53 * i + 0], + PARALLEL_COLS_53 * sizeof(OPJ_INT32)) + would do but would be a tiny bit slower. + We can take here advantage of our knowledge of alignment */ + STOREU(&tiledp_col[i * stride + 0], + LOAD(&tmp[PARALLEL_COLS_53 * i + 0])); + STOREU(&tiledp_col[i * stride + VREG_INT_COUNT], + LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT])); + } } +/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or + * 16 in AVX2, when top-most pixel is on even coordinate */ +static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2( + OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp_col, + const OPJ_INT32 stride) +{ + const OPJ_INT32* in_even = &tiledp_col[0]; + const OPJ_INT32* in_odd = &tiledp_col[sn * stride]; + + OPJ_INT32 i, j; + VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0; + VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1; + const VREG two = LOAD_CST(2); + + assert(len > 1); +#if __AVX2__ + assert(PARALLEL_COLS_53 == 16); + assert(VREG_INT_COUNT == 8); +#else + assert(PARALLEL_COLS_53 == 8); + assert(VREG_INT_COUNT == 4); +#endif + + /* Note: loads of input even/odd values must be done in a unaligned */ + /* fashion. But stores in tmp can be done with aligned store, since */ + /* the temporary buffer is properly aligned */ + assert((size_t)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); + + s1n_0 = LOADU(in_even + 0); + s1n_1 = LOADU(in_even + VREG_INT_COUNT); + d1n_0 = LOADU(in_odd); + d1n_1 = LOADU(in_odd + VREG_INT_COUNT); + + /* s0n = s1n - ((d1n + 1) >> 1); <==> */ + /* s0n = s1n - ((d1n + d1n + 2) >> 2); */ + s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); + s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); + + for (i = 0, j = 1; i < (len - 3); i += 2, j++) { + d1c_0 = d1n_0; + s0c_0 = s0n_0; + d1c_1 = d1n_1; + s0c_1 = s0n_1; + + s1n_0 = LOADU(in_even + j * stride); + s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT); + d1n_0 = LOADU(in_odd + j * stride); + d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT); + + /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/ + s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2)); + s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2)); + + STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0); + STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1); + + /* d1c + ((s0c + s0n) >> 1) */ + STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, + ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1))); + STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, + ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1))); + } + + STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0); + STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1); + + if (len & 1) { + VREG tmp_len_minus_1; + s1n_0 = LOADU(in_even + ((len - 1) / 2) * stride); + /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ + tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); + STORE(tmp + 8 * (len - 1), tmp_len_minus_1); + /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ + STORE(tmp + 8 * (len - 2), + ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1))); + + s1n_1 = LOADU(in_even + ((len - 1) / 2) * stride + VREG_INT_COUNT); + /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ + tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, + tmp_len_minus_1); + /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ + STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, + ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1))); + + + } else { + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, + ADD(d1n_0, s0n_0)); + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, + ADD(d1n_1, s0n_1)); + } + + opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); +} + + +/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or + * 16 in AVX2, when top-most pixel is on odd coordinate */ +static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2( + OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp_col, + const OPJ_INT32 stride) +{ + OPJ_INT32 i, j; + + VREG s1_0, s2_0, dc_0, dn_0; + VREG s1_1, s2_1, dc_1, dn_1; + const VREG two = LOAD_CST(2); + + const OPJ_INT32* in_even = &tiledp_col[sn * stride]; + const OPJ_INT32* in_odd = &tiledp_col[0]; + + assert(len > 2); +#if __AVX2__ + assert(PARALLEL_COLS_53 == 16); + assert(VREG_INT_COUNT == 8); +#else + assert(PARALLEL_COLS_53 == 8); + assert(VREG_INT_COUNT == 4); +#endif + + /* Note: loads of input even/odd values must be done in a unaligned */ + /* fashion. But stores in tmp can be done with aligned store, since */ + /* the temporary buffer is properly aligned */ + assert((size_t)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); + + s1_0 = LOADU(in_even + stride); + /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ + dc_0 = SUB(LOADU(in_odd + 0), + SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2)); + STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0)); + + s1_1 = LOADU(in_even + stride + VREG_INT_COUNT); + /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ + dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT), + SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2)); + STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT, + ADD(LOADU(in_even + VREG_INT_COUNT), dc_1)); + + for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { + + s2_0 = LOADU(in_even + (j + 1) * stride); + s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT); + + /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */ + dn_0 = SUB(LOADU(in_odd + j * stride), + SAR(ADD3(s1_0, s2_0, two), 2)); + dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT), + SAR(ADD3(s1_1, s2_1, two), 2)); + + STORE(tmp + PARALLEL_COLS_53 * i, dc_0); + STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); + + /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */ + STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, + ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); + STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, + ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); + + dc_0 = dn_0; + s1_0 = s2_0; + dc_1 = dn_1; + s1_1 = s2_1; + } + STORE(tmp + PARALLEL_COLS_53 * i, dc_0); + STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); + + if (!(len & 1)) { + /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */ + dn_0 = SUB(LOADU(in_odd + (len / 2 - 1) * stride), + SAR(ADD3(s1_0, s1_0, two), 2)); + dn_1 = SUB(LOADU(in_odd + (len / 2 - 1) * stride + VREG_INT_COUNT), + SAR(ADD3(s1_1, s1_1, two), 2)); + + /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */ + STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0, + ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); + STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, + ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); + + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0); + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1); + } else { + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0)); + STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, + ADD(s1_1, dc_1)); + } + + opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); +} + +#undef VREG +#undef LOAD_CST +#undef LOADU +#undef LOAD +#undef STORE +#undef STOREU +#undef ADD +#undef ADD3 +#undef SUB +#undef SAR + +#endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */ + +#if !defined(STANDARD_SLOW_VERSION) +/** Vertical inverse 5x3 wavelet transform for one column, when top-most + * pixel is on even coordinate */ +static void opj_idwt3_v_cas0(OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp_col, + const OPJ_INT32 stride) +{ + OPJ_INT32 i, j; + OPJ_INT32 d1c, d1n, s1n, s0c, s0n; + + assert(len > 1); + + /* Performs lifting in one single iteration. Saves memory */ + /* accesses and explicit interleaving. */ + + s1n = tiledp_col[0]; + d1n = tiledp_col[sn * stride]; + s0n = s1n - ((d1n + 1) >> 1); + + for (i = 0, j = 0; i < (len - 3); i += 2, j++) { + d1c = d1n; + s0c = s0n; + + s1n = tiledp_col[(j + 1) * stride]; + d1n = tiledp_col[(sn + j + 1) * stride]; + + s0n = s1n - ((d1c + d1n + 2) >> 2); + + tmp[i ] = s0c; + tmp[i + 1] = d1c + ((s0c + s0n) >> 1); + } + + tmp[i] = s0n; + + if (len & 1) { + tmp[len - 1] = + tiledp_col[((len - 1) / 2) * stride] - + ((d1n + 1) >> 1); + tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); + } else { + tmp[len - 1] = d1n + s0n; + } + + for (i = 0; i < len; ++i) { + tiledp_col[i * stride] = tmp[i]; + } +} + + +/** Vertical inverse 5x3 wavelet transform for one column, when top-most + * pixel is on odd coordinate */ +static void opj_idwt3_v_cas1(OPJ_INT32* tmp, + const OPJ_INT32 sn, + const OPJ_INT32 len, + OPJ_INT32* tiledp_col, + const OPJ_INT32 stride) +{ + OPJ_INT32 i, j; + OPJ_INT32 s1, s2, dc, dn; + const OPJ_INT32* in_even = &tiledp_col[sn * stride]; + const OPJ_INT32* in_odd = &tiledp_col[0]; + + assert(len > 2); + + /* Performs lifting in one single iteration. Saves memory */ + /* accesses and explicit interleaving. */ + + s1 = in_even[stride]; + dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); + tmp[0] = in_even[0] + dc; + for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { + + s2 = in_even[(j + 1) * stride]; + + dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); + tmp[i ] = dc; + tmp[i + 1] = s1 + ((dn + dc) >> 1); + + dc = dn; + s1 = s2; + } + tmp[i] = dc; + if (!(len & 1)) { + dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); + tmp[len - 2] = s1 + ((dn + dc) >> 1); + tmp[len - 1] = dn; + } else { + tmp[len - 1] = s1 + dc; + } + + for (i = 0; i < len; ++i) { + tiledp_col[i * stride] = tmp[i]; + } +} +#endif /* !defined(STANDARD_SLOW_VERSION) */ + +/* */ +/* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */ +/* */ +/* Performs interleave, inverse wavelet transform and copy back to buffer */ +static void opj_idwt53_v(const opj_dwt_t *dwt, + OPJ_INT32* tiledp_col, + OPJ_INT32 stride, + OPJ_INT32 nb_cols) +{ +#ifdef STANDARD_SLOW_VERSION + /* For documentation purpose */ + OPJ_INT32 k, c; + for (c = 0; c < nb_cols; c ++) { + opj_dwt_interleave_v(dwt, tiledp_col + c, stride); + opj_dwt_decode_1(dwt); + for (k = 0; k < dwt->sn + dwt->dn; ++k) { + tiledp_col[c + k * stride] = dwt->mem[k]; + } + } +#else + const OPJ_INT32 sn = dwt->sn; + const OPJ_INT32 len = sn + dwt->dn; + if (dwt->cas == 0) { + /* If len == 1, unmodified value */ + +#if (defined(__SSE2__) || defined(__AVX2__)) + if (len > 1 && nb_cols == PARALLEL_COLS_53) { + /* Same as below general case, except that thanks to SSE2/AVX2 */ + /* we can efficently process 8/16 columns in parallel */ + opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); + return; + } +#endif + if (len > 1) { + OPJ_INT32 c; + for (c = 0; c < nb_cols; c++, tiledp_col++) { + opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride); + } + return; + } + } else { + if (len == 1) { + OPJ_INT32 c; + for (c = 0; c < nb_cols; c++, tiledp_col++) { + tiledp_col[0] /= 2; + } + return; + } + + if (len == 2) { + OPJ_INT32 c; + OPJ_INT32* out = dwt->mem; + for (c = 0; c < nb_cols; c++, tiledp_col++) { + OPJ_INT32 i; + const OPJ_INT32* in_even = &tiledp_col[sn * stride]; + const OPJ_INT32* in_odd = &tiledp_col[0]; + + out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); + out[0] = in_even[0] + out[1]; + + for (i = 0; i < len; ++i) { + tiledp_col[i * stride] = out[i]; + } + } + + return; + } + +#if (defined(__SSE2__) || defined(__AVX2__)) + if (len > 2 && nb_cols == PARALLEL_COLS_53) { + /* Same as below general case, except that thanks to SSE2/AVX2 */ + /* we can efficently process 8/16 columns in parallel */ + opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); + return; + } +#endif + if (len > 2) { + OPJ_INT32 c; + for (c = 0; c < nb_cols; c++, tiledp_col++) { + opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride); + } + return; + } + } +#endif +} + + /* */ /* Forward 9-7 wavelet transform in 1-D. */ /* */ @@ -542,7 +1179,7 @@ OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) OPJ_BOOL opj_dwt_decode(opj_thread_pool_t* tp, opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) { - return opj_dwt_decode_tile(tp, tilec, numres, &opj_dwt_decode_1); + return opj_dwt_decode_tile(tp, tilec, numres); } @@ -639,7 +1276,6 @@ static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, typedef struct { opj_dwt_t h; - DWT1DFN dwt_1D; OPJ_UINT32 rw; OPJ_UINT32 w; OPJ_INT32 * OPJ_RESTRICT tiledp; @@ -655,9 +1291,7 @@ static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) job = (opj_dwd_decode_h_job_t*)user_data; for (j = job->min_j; j < job->max_j; j++) { - opj_dwt_interleave_h(&job->h, &job->tiledp[j * job->w]); - (job->dwt_1D)(&job->h); - memcpy(&job->tiledp[j * job->w], job->h.mem, job->rw * sizeof(OPJ_INT32)); + opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); } opj_aligned_free(job->h.mem); @@ -666,7 +1300,6 @@ static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) typedef struct { opj_dwt_t v; - DWT1DFN dwt_1D; OPJ_UINT32 rh; OPJ_UINT32 w; OPJ_INT32 * OPJ_RESTRICT tiledp; @@ -681,14 +1314,14 @@ static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) (void)tls; job = (opj_dwd_decode_v_job_t*)user_data; - for (j = job->min_j; j < job->max_j; j++) { - OPJ_UINT32 k; - opj_dwt_interleave_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w); - (job->dwt_1D)(&job->v); - for (k = 0; k < job->rh; ++k) { - job->tiledp[k * job->w + j] = job->v.mem[k]; - } + for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; + j += PARALLEL_COLS_53) { + opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w, + PARALLEL_COLS_53); } + if (j < job->max_j) + opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_INT32)job->w, + (OPJ_INT32)(job->max_j - j)); opj_aligned_free(job->v.mem); opj_free(job); @@ -699,7 +1332,7 @@ static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) /* Inverse wavelet transform in 2-D. */ /* */ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, - opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) + opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) { opj_dwt_t h; opj_dwt_t v; @@ -721,12 +1354,15 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, num_threads = opj_thread_pool_get_thread_count(tp); h_mem_size = opj_dwt_max_resolution(tr, numres); /* overflow check */ - if (h_mem_size > (SIZE_MAX / sizeof(OPJ_INT32))) { + if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) { /* FIXME event manager error callback */ return OPJ_FALSE; } - h_mem_size *= sizeof(OPJ_INT32); - h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); + /* We need PARALLEL_COLS_53 times the height of the array, */ + /* since for the vertical pass */ + /* we process PARALLEL_COLS_53 columns at a time */ + h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32); + h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); if (! h.mem) { /* FIXME event manager error callback */ return OPJ_FALSE; @@ -750,9 +1386,7 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, if (num_threads <= 1 || rh <= 1) { for (j = 0; j < rh; ++j) { - opj_dwt_interleave_h(&h, &tiledp[j * w]); - (dwt_1D)(&h); - memcpy(&tiledp[j * w], h.mem, rw * sizeof(OPJ_INT32)); + opj_idwt53_h(&h, &tiledp[j * w]); } } else { OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; @@ -777,7 +1411,6 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, return OPJ_FALSE; } job->h = h; - job->dwt_1D = dwt_1D; job->rw = rw; job->w = w; job->tiledp = tiledp; @@ -786,7 +1419,7 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ job->max_j = rh; } - job->h.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); + job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); if (!job->h.mem) { /* FIXME event manager error callback */ opj_thread_pool_wait_completion(tp, 0); @@ -803,14 +1436,12 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, v.cas = tr->y0 % 2; if (num_threads <= 1 || rw <= 1) { - for (j = 0; j < rw; ++j) { - OPJ_UINT32 k; - - opj_dwt_interleave_v(&v, &tiledp[j], (OPJ_INT32)w); - (dwt_1D)(&v); - for (k = 0; k < rh; ++k) { - tiledp[k * w + j] = v.mem[k]; - } + for (j = 0; j + PARALLEL_COLS_53 <= rw; + j += PARALLEL_COLS_53) { + opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, PARALLEL_COLS_53); + } + if (j < rw) { + opj_idwt53_v(&v, &tiledp[j], (OPJ_INT32)w, (OPJ_INT32)(rw - j)); } } else { OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; @@ -835,7 +1466,6 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, return OPJ_FALSE; } job->v = v; - job->dwt_1D = dwt_1D; job->rh = rh; job->w = w; job->tiledp = tiledp; @@ -844,7 +1474,7 @@ static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ job->max_j = rw; } - job->v.mem = (OPJ_INT32*)opj_aligned_malloc(h_mem_size); + job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); if (!job->v.mem) { /* FIXME event manager error callback */ opj_thread_pool_wait_completion(tp, 0); diff --git a/src/lib/openjp2/opj_includes.h b/src/lib/openjp2/opj_includes.h index 49aa33227..b33e63cef 100644 --- a/src/lib/openjp2/opj_includes.h +++ b/src/lib/openjp2/opj_includes.h @@ -187,6 +187,32 @@ static INLINE long opj_lrintf(float f) # pragma intrinsic(__emul) #endif +/* Apparently Visual Studio doesn't define __SSE__ / __SSE2__ macros */ +#if defined(_M_X64) +/* Intel 64bit support SSE and SSE2 */ +# ifndef __SSE__ +# define __SSE__ 1 +# endif +# ifndef __SSE2__ +# define __SSE2__ 1 +# endif +#endif + +/* For x86, test the value of the _M_IX86_FP macro. */ +/* See https://msdn.microsoft.com/en-us/library/b0084kay.aspx */ +#if defined(_M_IX86_FP) +# if _M_IX86_FP >= 1 +# ifndef __SSE__ +# define __SSE__ 1 +# endif +# endif +# if _M_IX86_FP >= 2 +# ifndef __SSE2__ +# define __SSE2__ 1 +# endif +# endif +#endif + /* Type to use for bit-fields in internal headers */ typedef unsigned int OPJ_BITFIELD; diff --git a/src/lib/openjp2/opj_malloc.c b/src/lib/openjp2/opj_malloc.c index 9c438bdb0..dca91bfcb 100644 --- a/src/lib/openjp2/opj_malloc.c +++ b/src/lib/openjp2/opj_malloc.c @@ -213,6 +213,15 @@ void * opj_aligned_realloc(void *ptr, size_t size) return opj_aligned_realloc_n(ptr, 16U, size); } +void *opj_aligned_32_malloc(size_t size) +{ + return opj_aligned_alloc_n(32U, size); +} +void * opj_aligned_32_realloc(void *ptr, size_t size) +{ + return opj_aligned_realloc_n(ptr, 32U, size); +} + void opj_aligned_free(void* ptr) { #if defined(OPJ_HAVE_POSIX_MEMALIGN) || defined(OPJ_HAVE_MEMALIGN) diff --git a/src/lib/openjp2/opj_malloc.h b/src/lib/openjp2/opj_malloc.h index c8c2fc2db..7503c28d2 100644 --- a/src/lib/openjp2/opj_malloc.h +++ b/src/lib/openjp2/opj_malloc.h @@ -71,6 +71,14 @@ void * opj_aligned_malloc(size_t size); void * opj_aligned_realloc(void *ptr, size_t size); void opj_aligned_free(void* ptr); +/** +Allocate memory aligned to a 32 byte boundary +@param size Bytes to allocate +@return Returns a void pointer to the allocated space, or NULL if there is insufficient memory available +*/ +void * opj_aligned_32_malloc(size_t size); +void * opj_aligned_32_realloc(void *ptr, size_t size); + /** Reallocate memory blocks. @param m Pointer to previously allocated memory block diff --git a/tools/ctest_scripts/travis-ci.cmake b/tools/ctest_scripts/travis-ci.cmake index 828b5af4d..75ed6f6bf 100644 --- a/tools/ctest_scripts/travis-ci.cmake +++ b/tools/ctest_scripts/travis-ci.cmake @@ -53,6 +53,11 @@ if (NOT "$ENV{OPJ_CI_ARCH}" STREQUAL "") endif() endif() +if (NOT "$ENV{OPJ_CI_INSTRUCTION_SETS}" STREQUAL "") + set(CCFLAGS_ARCH "${CCFLAGS_ARCH} $ENV{OPJ_CI_INSTRUCTION_SETS}") +endif() + + if ("$ENV{OPJ_CI_ASAN}" STREQUAL "1") set(OPJ_HAS_MEMCHECK TRUE) set(CTEST_MEMORYCHECK_TYPE "AddressSanitizer")