Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Single-threaded performance improvements in forward DWT for 5-3 and 9-7 (and other improvements) #1253

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
97eb7e0
Add multithreading support in the T1 (entropy phase) encoder
rouault Apr 29, 2020
07d1f77
Add multithreaded support in the DWT encoder.
rouault Apr 30, 2020
99107d5
dwt.c: change sign of constants to match standard and compensate (no …
rouault May 18, 2020
00cff6f
Encoder: use floating-point operations for irreversible transformation
rouault May 18, 2020
3d35d0f
tcd.c: add comment
rouault May 18, 2020
c2b9d09
compare_images.c: code reformatting
rouault May 19, 2020
fe4c15f
Testing: revise testing of lossy encoding by comparing PEAK and MSE w…
rouault May 19, 2020
c6a413a
opj_mct_encode_real(): add SSE optimization
rouault May 19, 2020
4ab2ed0
opj_j2k_setup_encoder(): add validation of tile width and height to a…
rouault May 19, 2020
e46e300
opj_dwt_encode_1_real(): avoid many bound comparisons, similarly to d…
rouault May 20, 2020
f38c069
Irreversible decoding: align code more closely to the standard by avo…
rouault May 20, 2020
3cd1305
Irreversible compression/decompression DWT: use 1/K constant as per s…
rouault May 20, 2020
adccbc8
Irreversible decoding: partially revert previous commit, to fix failu…
rouault May 20, 2020
0c09062
bench_dwt.c: add a -I switch to test irreversible FWDT/IDWT
rouault May 20, 2020
47943da
Speed-up 9x7 IDWD by ~20%
rouault May 21, 2020
272b3e0
Remove useless + 5U margin in opj_dwt_decode_tile_97()
rouault May 21, 2020
45a3522
Speed-up 9x7 IDWD by ~30% with OPJ_NUM_THREADS=2
rouault May 21, 2020
bd5f5ee
Forward DWT: small code refactoring to allow future improvements for …
rouault May 21, 2020
97b384a
Forward DWT 5x3: performance improvements in horizontal pass, and mod…
rouault May 22, 2020
33d3d0d
dwt.c: remove unused typedef
rouault May 22, 2020
e69fa09
Forward DWT: small code refactoring to allow future improvements for …
rouault May 22, 2020
a38e970
Forward DWT 5-3: major speed up by vectorizing vertical pass
rouault May 22, 2020
1e931fd
Forward DWT 9-7: major speed up by vectorizing vertical pass
rouault May 22, 2020
1c5627e
T1 encoder: speed-up by aggressive inlining and more cache friendly d…
rouault May 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ if(BUILD_JPIP_SERVER)
endif()
add_subdirectory(src/lib)
option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF)
if(UNIX)
option(BUILD_UNIT_TESTS "Build unit tests (bench_dwt, test_sparse_array, etc..)" OFF)
endif()

#-----------------------------------------------------------------------------
# Build Applications
Expand Down
35 changes: 32 additions & 3 deletions src/bin/jp2/opj_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,10 @@ static void encode_help_display(void)
fprintf(stdout, " Currently supports only RPCL order.\n");
fprintf(stdout, "-C <comment>\n");
fprintf(stdout, " Add <comment> in the comment marker segment.\n");
if (opj_has_thread_support()) {
fprintf(stdout, " -threads <num_threads|ALL_CPUS>\n"
" Number of threads to use for encoding or ALL_CPUS for all available cores.\n");
}
/* UniPG>> */
#ifdef USE_JPWL
fprintf(stdout, "-W <params>\n");
Expand Down Expand Up @@ -579,7 +583,8 @@ static int parse_cmdline_encoder(int argc, char **argv,
img_fol_t *img_fol, raw_cparameters_t *raw_cp, char *indexfilename,
size_t indexfilename_size,
int* pOutFramerate,
OPJ_BOOL* pOutPLT)
OPJ_BOOL* pOutPLT,
int* pOutNumThreads)
{
OPJ_UINT32 i, j;
int totlen, c;
Expand All @@ -596,7 +601,8 @@ static int parse_cmdline_encoder(int argc, char **argv,
{"jpip", NO_ARG, NULL, 'J'},
{"mct", REQ_ARG, NULL, 'Y'},
{"IMF", REQ_ARG, NULL, 'Z'},
{"PLT", NO_ARG, NULL, 'A'}
{"PLT", NO_ARG, NULL, 'A'},
{"threads", REQ_ARG, NULL, 'B'}
};

/* parse the command line */
Expand Down Expand Up @@ -1679,6 +1685,19 @@ static int parse_cmdline_encoder(int argc, char **argv,
}
break;

/* ----------------------------------------------------- */
case 'B': { /* Number of threads */
if (strcmp(opj_optarg, "ALL_CPUS") == 0) {
*pOutNumThreads = opj_get_num_cpus();
if (*pOutNumThreads == 1) {
*pOutNumThreads = 0;
}
} else {
sscanf(opj_optarg, "%d", pOutNumThreads);
}
}
break;

/* ------------------------------------------------------ */


Expand Down Expand Up @@ -1860,6 +1879,7 @@ int main(int argc, char **argv)
OPJ_FLOAT64 t = opj_clock();

OPJ_BOOL PLT = OPJ_FALSE;
int num_threads = 0;

/* set encoding parameters to default values */
opj_set_default_encoder_parameters(&parameters);
Expand All @@ -1880,7 +1900,7 @@ int main(int argc, char **argv)
parameters.tcp_mct = (char)
255; /* This will be set later according to the input image or the provided option */
if (parse_cmdline_encoder(argc, argv, &parameters, &img_fol, &raw_cp,
indexfilename, sizeof(indexfilename), &framerate, &PLT) == 1) {
indexfilename, sizeof(indexfilename), &framerate, &PLT, &num_threads) == 1) {
ret = 1;
goto fin;
}
Expand Down Expand Up @@ -2141,6 +2161,15 @@ int main(int argc, char **argv)
}
}

if (num_threads >= 1 &&
!opj_codec_set_threads(l_codec, num_threads)) {
fprintf(stderr, "failed to set number of threads\n");
opj_destroy_codec(l_codec);
opj_image_destroy(image);
ret = 1;
goto fin;
}

/* open a byte stream for writing and allocate memory for all tiles */
l_stream = opj_stream_create_default_file_stream(parameters.outfile, OPJ_FALSE);
if (! l_stream) {
Expand Down
4 changes: 2 additions & 2 deletions src/lib/openjp2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)

if(BUILD_UNIT_TESTS)
if(BUILD_UNIT_TESTS AND UNIX)
add_executable(bench_dwt bench_dwt.c)
if(UNIX)
target_link_libraries(bench_dwt m ${OPENJPEG_LIBRARY_NAME})
Expand All @@ -215,4 +215,4 @@ if(BUILD_UNIT_TESTS)
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
target_link_libraries(test_sparse_array ${CMAKE_THREAD_LIBS_INIT})
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
endif(BUILD_UNIT_TESTS)
endif(BUILD_UNIT_TESTS AND UNIX)
140 changes: 110 additions & 30 deletions src/lib/openjp2/bench_dwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec,
OPJ_INT32 y0,
OPJ_INT32 x1,
OPJ_INT32 y1,
OPJ_UINT32 numresolutions)
OPJ_UINT32 numresolutions,
OPJ_BOOL irreversible)
{
opj_tcd_resolution_t* l_res;
OPJ_UINT32 resno, l_level_no;
Expand All @@ -64,9 +65,16 @@ void init_tilec(opj_tcd_tilecomp_t * l_tilec,
(size_t)(l_tilec->y1 - l_tilec->y0);
l_tilec->data = (OPJ_INT32*) opj_malloc(sizeof(OPJ_INT32) * nValues);
for (i = 0; i < nValues; i++) {
l_tilec->data[i] = getValue((OPJ_UINT32)i);
OPJ_INT32 val = getValue((OPJ_UINT32)i);
if (irreversible) {
OPJ_FLOAT32 fVal = (OPJ_FLOAT32)val;
memcpy(&l_tilec->data[i], &fVal, sizeof(OPJ_FLOAT32));
} else {
l_tilec->data[i] = val;
}
}
l_tilec->numresolutions = numresolutions;
l_tilec->minimum_num_resolutions = numresolutions;
l_tilec->resolutions = (opj_tcd_resolution_t*) opj_calloc(
l_tilec->numresolutions,
sizeof(opj_tcd_resolution_t));
Expand Down Expand Up @@ -98,9 +106,9 @@ void free_tilec(opj_tcd_tilecomp_t * l_tilec)
void usage(void)
{
printf(
"bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n");
"bench_dwt [-decode|encode] [-I] [-size value] [-check] [-display]\n");
printf(
" [-offset x y] [-num_threads val]\n");
" [-num_resolutions val] [-offset x y] [-num_threads val]\n");
exit(1);
}

Expand Down Expand Up @@ -131,6 +139,17 @@ OPJ_FLOAT64 opj_clock(void)
#endif
}

static OPJ_FLOAT64 opj_wallclock(void)
{
#ifdef _WIN32
return opj_clock();
#else
struct timeval tv;
gettimeofday(&tv, NULL);
return (OPJ_FLOAT64)tv.tv_sec + 1e-6 * (OPJ_FLOAT64)tv.tv_usec;
#endif
}

int main(int argc, char** argv)
{
int num_threads = 0;
Expand All @@ -146,16 +165,24 @@ int main(int argc, char** argv)
OPJ_BOOL check = OPJ_FALSE;
OPJ_INT32 size = 16384 - 1;
OPJ_FLOAT64 start, stop;
OPJ_FLOAT64 start_wc, stop_wc;
OPJ_UINT32 offset_x = ((OPJ_UINT32)size + 1) / 2 - 1;
OPJ_UINT32 offset_y = ((OPJ_UINT32)size + 1) / 2 - 1;
OPJ_UINT32 num_resolutions = 6;
OPJ_BOOL bench_decode = OPJ_TRUE;
OPJ_BOOL irreversible = OPJ_FALSE;

for (i = 1; i < argc; i++) {
if (strcmp(argv[i], "-display") == 0) {
if (strcmp(argv[i], "-encode") == 0) {
bench_decode = OPJ_FALSE;
} else if (strcmp(argv[i], "-decode") == 0) {
bench_decode = OPJ_TRUE;
} else if (strcmp(argv[i], "-display") == 0) {
display = OPJ_TRUE;
check = OPJ_TRUE;
} else if (strcmp(argv[i], "-check") == 0) {
check = OPJ_TRUE;
} else if (strcmp(argv[i], "-I") == 0) {
irreversible = OPJ_TRUE;
} else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) {
size = atoi(argv[i + 1]);
i ++;
Expand All @@ -179,18 +206,29 @@ int main(int argc, char** argv)
}
}

if (irreversible && check) {
/* The irreversible inverse DWT is not symmetric with the forward DWT. */
/* See BUG_WEIRD_TWO_INVK in dwt.c */
printf("-I and -check aren't compatible\n");
exit(1);
}

tp = opj_thread_pool_create(num_threads);

init_tilec(&tilec, (OPJ_INT32)offset_x, (OPJ_INT32)offset_y,
(OPJ_INT32)offset_x + size, (OPJ_INT32)offset_y + size,
num_resolutions);
num_resolutions, irreversible);

if (display) {
printf("Before\n");
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
printf("%d ", tilec.data[k]);
if (irreversible) {
printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]);
} else {
printf("%d ", tilec.data[k]);
}
k ++;
}
printf("\n");
Expand Down Expand Up @@ -223,45 +261,87 @@ int main(int argc, char** argv)
image_comp.dy = 1;

start = opj_clock();
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
start_wc = opj_wallclock();
if (bench_decode) {
if (irreversible) {
opj_dwt_decode_real(&tcd, &tilec, tilec.numresolutions);
} else {
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
}
} else {
if (irreversible) {
opj_dwt_encode_real(&tcd, &tilec);
} else {
opj_dwt_encode(&tcd, &tilec);
}
}
stop = opj_clock();
printf("time for dwt_decode: %.03f s\n", stop - start);
stop_wc = opj_wallclock();
printf("time for %s: total = %.03f s, wallclock = %.03f s\n",
bench_decode ? "dwt_decode" : "dwt_encode",
stop - start,
stop_wc - start_wc);

if (display || check) {
if (display) {
if (display) {
if (bench_decode) {
printf("After IDWT\n");
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
} else {
printf("After FDWT\n");
}
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
if (irreversible) {
printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]);
} else {
printf("%d ", tilec.data[k]);
k ++;
}
printf("\n");
k ++;
}
printf("\n");
}
}

opj_dwt_encode(&tilec);
if (display) {
printf("After FDWT\n");
if ((display || check) && !irreversible) {

if (bench_decode) {
opj_dwt_encode(&tcd, &tilec);
} else {
opj_dwt_decode(&tcd, &tilec, tilec.numresolutions);
}


if (display && !irreversible) {
if (bench_decode) {
printf("After FDWT\n");
} else {
printf("After IDWT\n");
}
k = 0;
for (j = 0; j < tilec.y1 - tilec.y0; j++) {
for (i = 0; i < tilec.x1 - tilec.x0; i++) {
printf("%d ", tilec.data[k]);
if (irreversible) {
printf("%f ", ((OPJ_FLOAT32*)tilec.data)[k]);
} else {
printf("%d ", tilec.data[k]);
}
k ++;
}
printf("\n");
}
}

if (check) {
size_t idx;
size_t nValues = (size_t)(tilec.x1 - tilec.x0) *
(size_t)(tilec.y1 - tilec.y0);
for (idx = 0; idx < nValues; idx++) {
if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) {
printf("Difference found at idx = %u\n", (OPJ_UINT32)idx);
exit(1);
}
}

if (check) {

size_t idx;
size_t nValues = (size_t)(tilec.x1 - tilec.x0) *
(size_t)(tilec.y1 - tilec.y0);
for (idx = 0; idx < nValues; idx++) {
if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) {
printf("Difference found at idx = %u\n", (OPJ_UINT32)idx);
exit(1);
}
}
}
Expand Down
Loading