diff --git a/debian/patches/0004-add-cuda-tonemap-impl.patch b/debian/patches/0004-add-cuda-tonemap-impl.patch index fa48418e6ba..19ac4be50f1 100644 --- a/debian/patches/0004-add-cuda-tonemap-impl.patch +++ b/debian/patches/0004-add-cuda-tonemap-impl.patch @@ -324,7 +324,7 @@ Index: FFmpeg/libavfilter/cuda/colorspace_common.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/cuda/colorspace_common.h -@@ -0,0 +1,330 @@ +@@ -0,0 +1,338 @@ +/* + * This file is part of FFmpeg. + * @@ -361,6 +361,10 @@ Index: FFmpeg/libavfilter/cuda/colorspace_common.h +#define ARIB_B67_B 0.28466892f +#define ARIB_B67_C 0.55991073f + ++#define LIMITED_BLACK 0.06256109482f ++#define LIMITED_WHITE 0.9188660802f ++#define LIMITED_RANGE 0.8563049854f ++ +#define FLOAT_EPS 1e-6f + +extern __constant__ const float ref_white; @@ -497,16 +501,17 @@ Index: FFmpeg/libavfilter/cuda/colorspace_common.h +} + +static __inline__ __device__ float3 yuv2rgb(float y, float u, float v) { -+ if (range_src == AVCOL_RANGE_JPEG) { -+ u -= 0.5f; v -= 0.5f; -+ } else { -+ y = (y * 255.0f - 16.0f) / 219.0f; -+ u = (u * 255.0f - 128.0f) / 224.0f; -+ v = (v * 255.0f - 128.0f) / 224.0f; -+ } ++ u -= 0.5f; ++ v -= 0.5f; + float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2]; + float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5]; + float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8]; ++ if (range_src == AVCOL_RANGE_MPEG) { ++ r = (r - LIMITED_BLACK) / LIMITED_RANGE; ++ g = (g - LIMITED_BLACK) / LIMITED_RANGE; ++ b = (b - LIMITED_BLACK) / LIMITED_RANGE; ++ } ++ + return make_float3(r, g, b); +} + @@ -518,23 +523,26 @@ Index: FFmpeg/libavfilter/cuda/colorspace_common.h +} + +static __inline__ __device__ float3 rgb2yuv(float r, float g, float b) { ++ if (range_dst == AVCOL_RANGE_MPEG) { ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; ++ } + float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2]; + float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5]; + float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8]; -+ if (range_dst == AVCOL_RANGE_JPEG) { -+ u += 0.5f; v += 0.5f; -+ } else { -+ y = (219.0f * y + 16.0f) / 255.0f; -+ u = (224.0f * u + 128.0f) / 255.0f; -+ v = (224.0f * v + 128.0f) / 255.0f; -+ } ++ u += 0.5f; ++ v += 0.5f; + return make_float3(y, u, v); +} + +static __inline__ __device__ float rgb2y(float r, float g, float b) { ++ if (range_dst == AVCOL_RANGE_MPEG) { ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; ++ } + float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2]; -+ if (range_dst != AVCOL_RANGE_JPEG) -+ y = (219.0f * y + 16.0f) / 255.0f; + return y; +} + @@ -1767,7 +1775,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_cuda.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemap_cuda.c -@@ -0,0 +1,1127 @@ +@@ -0,0 +1,1123 @@ +/* + * This file is part of FFmpeg. + * @@ -1877,7 +1885,6 @@ Index: FFmpeg/libavfilter/vf_tonemap_cuda.c + int apply_dovi; + int tradeoff; + int init_with_dovi; -+ double ref_white; + double param; + double desat_param; + double peak; @@ -2311,10 +2318,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_cuda.c + if (isnan(s->param)) + s->param = 1.0f; + -+ s->ref_white = s->tonemap == TONEMAP_BT2390 ? REFERENCE_WHITE_ALT -+ : REFERENCE_WHITE; -+ -+ if (s->tonemap == TONEMAP_BT2390 && s->peak) ++ if (s->peak) + s->peak = FFMAX(s->peak / 10.0f, 1.1f); + + s->dst_peak = 1.0f; @@ -2418,11 +2422,11 @@ Index: FFmpeg/libavfilter/vf_tonemap_cuda.c + CONSTANT(".u32 enable_dither = %i", (int)(s->in_desc->comp[0].depth > s->out_desc->comp[0].depth)); + CONSTANT(".f32 dither_size = %f", (float)ff_fruit_dither_size); + CONSTANT(".f32 dither_quantization = %f", (float)((1 << s->out_desc->comp[0].depth) - 1)); -+ CONSTANT(".f32 ref_white = %f", s->ref_white); ++ CONSTANT(".f32 ref_white = %f", REFERENCE_WHITE_ALT); + CONSTANT(".f32 tone_param = %f", s->param); + CONSTANT(".f32 desat_param = %f", s->desat_param); -+ CONSTANT(".f32 pq_max_lum_div_ref_white = %f", (float)(ST2084_MAX_LUMINANCE / s->ref_white)); -+ CONSTANT(".f32 ref_white_div_pq_max_lum = %f", (float)(s->ref_white / ST2084_MAX_LUMINANCE)); ++ CONSTANT(".f32 pq_max_lum_div_ref_white = %f", (float)(ST2084_MAX_LUMINANCE / REFERENCE_WHITE_ALT)); ++ CONSTANT(".f32 ref_white_div_pq_max_lum = %f", (float)(REFERENCE_WHITE_ALT / ST2084_MAX_LUMINANCE)); + CONSTANT_M("rgb_matrix", (s->dovi ? s->dovi->nonlinear : rgb_matrix)); + CONSTANT_M("yuv_matrix", yuv_matrix); + CONSTANT_A(".u8 rgb2rgb_passthrough = %i", 1, in_pri == out_pri); diff --git a/debian/patches/0007-add-bt2390-eetf-and-code-refactor-to-opencl-tonemap.patch b/debian/patches/0007-add-bt2390-eetf-and-code-refactor-to-opencl-tonemap.patch index 08ce1bdbecc..1b411fbf915 100644 --- a/debian/patches/0007-add-bt2390-eetf-and-code-refactor-to-opencl-tonemap.patch +++ b/debian/patches/0007-add-bt2390-eetf-and-code-refactor-to-opencl-tonemap.patch @@ -65,7 +65,7 @@ Index: FFmpeg/libavfilter/opencl/colorspace_common.cl =================================================================== --- FFmpeg.orig/libavfilter/opencl/colorspace_common.cl +++ FFmpeg/libavfilter/opencl/colorspace_common.cl -@@ -17,7 +17,17 @@ +@@ -17,7 +17,21 @@ */ #define ST2084_MAX_LUMINANCE 10000.0f @@ -80,11 +80,15 @@ Index: FFmpeg/libavfilter/opencl/colorspace_common.cl +#define ARIB_B67_B 0.28466892f +#define ARIB_B67_C 0.55991073f + ++#define LIMITED_BLACK 0.06256109482f ++#define LIMITED_WHITE 0.9188660802f ++#define LIMITED_RANGE 0.8563049854f ++ +#define FLOAT_EPS 1e-6f #if chroma_loc == 1 #define chroma_sample(a,b,c,d) (((a) + (c)) * 0.5f) -@@ -33,81 +43,124 @@ +@@ -33,92 +47,134 @@ #define chroma_sample(a,b,c,d) (((a) + (b) + (c) + (d)) * 0.25f) #endif @@ -183,21 +187,33 @@ Index: FFmpeg/libavfilter/opencl/colorspace_common.cl + float den = 1.0f + ST2084_C3 * xpow; + return native_powr(1.0f + num / den, ST2084_M2); +#endif -+} -+ + } + +-float inverse_eotf_bt1886(float c) { +- return c < 0.0f ? 0.0f : powr(c, 1.0f / 2.4f); +float inverse_eotf_st2084(float x) { + x *= ref_white_div_pq_max_lum; + return inverse_eotf_st2084_common(x); -+} -+ + } + +-float oetf_bt709(float c) { +- c = c < 0.0f ? 0.0f : c; +- float r1 = 4.5f * c; +- float r2 = 1.099f * powr(c, 0.45f) - 0.099f; +- return c < 0.018f ? r1 : r2; +-} +-float inverse_oetf_bt709(float c) { +- float r1 = c / 4.5f; +- float r2 = powr((c + 0.099f) / 1.099f, 1.0f / 0.45f); +- return c < 0.081f ? r1 : r2; +float4 eotf_st2084x4(float4 x) { + x.x = eotf_st2084_common(x.x); + x.y = eotf_st2084_common(x.y); + x.z = eotf_st2084_common(x.z); + x.w = eotf_st2084_common(x.w); + return x * pq_max_lum_div_ref_white; -+} -+ + } + +float4 inverse_eotf_st2084x4(float4 x) { + x *= ref_white_div_pq_max_lum; + x.x = inverse_eotf_st2084_common(x.x); @@ -213,34 +229,22 @@ Index: FFmpeg/libavfilter/opencl/colorspace_common.cl + +float inverse_ootf_1_2(float x) { + return x > 0.0f ? native_powr(x, 1.0f / 1.2f) : x; - } - --float inverse_eotf_bt1886(float c) { -- return c < 0.0f ? 0.0f : powr(c, 1.0f / 2.4f); ++} ++ +float oetf_arib_b67(float x) { + x = fmax(x, 0.0f); + return x <= (1.0f / 12.0f) + ? native_sqrt(3.0f * x) + : (ARIB_B67_A * native_log(12.0f * x - ARIB_B67_B) + ARIB_B67_C); - } - --float oetf_bt709(float c) { -- c = c < 0.0f ? 0.0f : c; -- float r1 = 4.5f * c; -- float r2 = 1.099f * powr(c, 0.45f) - 0.099f; -- return c < 0.018f ? r1 : r2; --} --float inverse_oetf_bt709(float c) { -- float r1 = c / 4.5f; -- float r2 = powr((c + 0.099f) / 1.099f, 1.0f / 0.45f); -- return c < 0.081f ? r1 : r2; ++} ++ +float inverse_oetf_arib_b67(float x) { + x = fmax(x, 0.0f); + return x <= 0.5f + ? (x * x) * (1.0f / 3.0f) + : (native_exp((x - ARIB_B67_C) / ARIB_B67_A) + ARIB_B67_B) * (1.0f / 12.0f); - } - ++} ++ +// linearizer for HLG/ARIB-B67 +float eotf_arib_b67(float x) { + return ootf_1_2(inverse_oetf_arib_b67(x)) * 5.0f; @@ -263,19 +267,58 @@ Index: FFmpeg/libavfilter/opencl/colorspace_common.cl +#endif + float3 yuv2rgb(float y, float u, float v) { - #ifdef FULL_RANGE_IN +-#ifdef FULL_RANGE_IN u -= 0.5f; v -= 0.5f; -@@ -150,7 +203,9 @@ float3 rgb2yuv(float r, float g, float b +-#else +- y = (y * 255.0f - 16.0f) / 219.0f; +- u = (u * 255.0f - 128.0f) / 224.0f; +- v = (v * 255.0f - 128.0f) / 224.0f; +-#endif + float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2]; + float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5]; + float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8]; ++#ifndef FULL_RANGE_IN ++ r = (r - LIMITED_BLACK) / LIMITED_RANGE; ++ g = (g - LIMITED_BLACK) / LIMITED_RANGE; ++ b = (b - LIMITED_BLACK) / LIMITED_RANGE; ++#endif + return (float3)(r, g, b); + } - float rgb2y(float r, float g, float b) { +@@ -135,22 +191,25 @@ float3 yuv2lrgb(float3 yuv) { + } + + float3 rgb2yuv(float r, float g, float b) { ++#ifndef FULL_RANGE_OUT ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; ++#endif float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2]; + float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5]; + float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8]; +-#ifdef FULL_RANGE_OUT + u += 0.5f; v += 0.5f; +-#else +- y = (219.0f * y + 16.0f) / 255.0f; +- u = (224.0f * u + 128.0f) / 255.0f; +- v = (224.0f * v + 128.0f) / 255.0f; +-#endif + return (float3)(y, u, v); + } + + float rgb2y(float r, float g, float b) { +#ifndef FULL_RANGE_OUT - y = (219.0f * y + 16.0f) / 255.0f; ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; +#endif + float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2]; +- y = (219.0f * y + 16.0f) / 255.0f; return y; } -@@ -188,18 +243,101 @@ float3 lrgb2lrgb(float3 c) { +@@ -188,18 +247,101 @@ float3 lrgb2lrgb(float3 c) { #endif } @@ -1135,7 +1178,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c }; typedef struct TonemapOpenCLContext { -@@ -56,23 +77,44 @@ typedef struct TonemapOpenCLContext { +@@ -56,23 +77,43 @@ typedef struct TonemapOpenCLContext { enum AVColorPrimaries primaries, primaries_in, primaries_out; enum AVColorRange range, range_in, range_out; enum AVChromaLocation chroma_loc; @@ -1160,7 +1203,6 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c + enum TonemapMode tonemap_mode; enum AVPixelFormat format; + int apply_dovi; -+ double ref_white; double peak; + double target_peak; double param; @@ -1184,7 +1226,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c }; static const char *const delinearize_funcs[AVCOL_TRC_NB] = { -@@ -80,7 +122,7 @@ static const char *const delinearize_fun +@@ -80,7 +121,7 @@ static const char *const delinearize_fun [AVCOL_TRC_BT2020_10] = "inverse_eotf_bt1886", }; @@ -1193,7 +1235,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c [TONEMAP_NONE] = "direct", [TONEMAP_LINEAR] = "linear", [TONEMAP_GAMMA] = "gamma", -@@ -88,8 +130,54 @@ static const char *const tonemap_func[TO +@@ -88,8 +129,54 @@ static const char *const tonemap_func[TO [TONEMAP_REINHARD] = "reinhard", [TONEMAP_HABLE] = "hable", [TONEMAP_MOBIUS] = "mobius", @@ -1226,7 +1268,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c + return AVERROR(ENOMEM); + for (i = 0; i < LUT_SIZE; i++) { + float x = (float)i / (LUT_SIZE - 1); -+ ctx->lin_lut[i] = FFMAX(linearize(x, ctx->ref_white, ctx->trc_in), 0.0f); ++ ctx->lin_lut[i] = FFMAX(linearize(x, REFERENCE_WHITE_ALT, ctx->trc_in), 0.0f); + } + + return 0; @@ -1248,7 +1290,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c static int get_rgb2rgb_matrix(enum AVColorPrimaries in, enum AVColorPrimaries out, double rgb2rgb[3][3]) { double rgb2xyz[3][3], xyz2rgb[3][3]; -@@ -108,23 +196,150 @@ static int get_rgb2rgb_matrix(enum AVCol +@@ -108,23 +195,150 @@ static int get_rgb2rgb_matrix(enum AVCol return 0; } @@ -1408,14 +1450,11 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c switch(ctx->tonemap) { case TONEMAP_GAMMA: -@@ -144,48 +359,170 @@ static int tonemap_opencl_init(AVFilterC +@@ -144,48 +358,167 @@ static int tonemap_opencl_init(AVFilterC if (isnan(ctx->param)) ctx->param = 1.0f; -+ ctx->ref_white = ctx->tonemap == TONEMAP_BT2390 ? REFERENCE_WHITE_ALT -+ : REFERENCE_WHITE; -+ -+ if (ctx->tonemap == TONEMAP_BT2390 && ctx->peak) ++ if (ctx->peak) + ctx->peak = FFMAX(ctx->peak / 10.0f, 1.1f); + // SDR peak is 1.0f @@ -1538,7 +1577,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c + av_bprint_init(&header, 2048, AV_BPRINT_SIZE_UNLIMITED); + + av_bprintf(&header, "__constant float ref_white = %.4ff;\n", -+ ctx->ref_white); ++ REFERENCE_WHITE_ALT); + av_bprintf(&header, "__constant float tone_param = %.4ff;\n", ctx->param); - av_bprintf(&header, "__constant const float desat_param = %.4ff;\n", @@ -1553,9 +1592,9 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c ctx->scene_threshold); + + av_bprintf(&header, "__constant float pq_max_lum_div_ref_white = %ff;\n", -+ (ST2084_MAX_LUMINANCE / ctx->ref_white)); ++ (ST2084_MAX_LUMINANCE / REFERENCE_WHITE_ALT)); + av_bprintf(&header, "__constant float ref_white_div_pq_max_lum = %ff;\n", -+ (ctx->ref_white / ST2084_MAX_LUMINANCE)); ++ (REFERENCE_WHITE_ALT / ST2084_MAX_LUMINANCE)); + av_bprintf(&header, "#define TONE_FUNC %s\n", tonemap_func[ctx->tonemap]); - av_bprintf(&header, "#define DETECTION_FRAMES %d\n", DETECTION_FRAMES); @@ -1594,7 +1633,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c if (ctx->range_in == AVCOL_RANGE_JPEG) av_bprintf(&header, "#define FULL_RANGE_IN\n"); -@@ -199,19 +536,41 @@ static int tonemap_opencl_init(AVFilterC +@@ -199,19 +532,41 @@ static int tonemap_opencl_init(AVFilterC else ff_opencl_print_const_matrix_3x3(&header, "rgb2rgb", rgb2rgb); @@ -1643,7 +1682,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c ctx->colorspace_out, av_color_space_name(ctx->colorspace_out)); goto fail; } -@@ -219,24 +578,23 @@ static int tonemap_opencl_init(AVFilterC +@@ -219,24 +574,23 @@ static int tonemap_opencl_init(AVFilterC ff_fill_rgb2yuv_table(luma_dst, rgb2yuv); ff_opencl_print_const_matrix_3x3(&header, "yuv_matrix", rgb2yuv); @@ -1683,7 +1722,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c av_log(avctx, AV_LOG_DEBUG, "Generated OpenCL header:\n%s\n", header.str); opencl_sources[0] = header.str; -@@ -254,46 +612,171 @@ static int tonemap_opencl_init(AVFilterC +@@ -254,46 +608,171 @@ static int tonemap_opencl_init(AVFilterC CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL " "command queue %d.\n", cle); @@ -1873,7 +1912,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c ret = ff_opencl_filter_config_output(outlink); if (ret < 0) return ret; -@@ -308,13 +791,46 @@ static int launch_kernel(AVFilterContext +@@ -308,13 +787,46 @@ static int launch_kernel(AVFilterContext size_t global_work[2]; size_t local_work[2]; cl_int cle; @@ -1922,7 +1961,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c local_work[0] = 16; local_work[1] = 16; -@@ -338,13 +854,10 @@ static int tonemap_opencl_filter_frame(A +@@ -338,13 +850,10 @@ static int tonemap_opencl_filter_frame(A AVFilterContext *avctx = inlink->dst; AVFilterLink *outlink = avctx->outputs[0]; TonemapOpenCLContext *ctx = avctx->priv; @@ -1937,7 +1976,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n", av_get_pix_fmt_name(input->format), -@@ -363,9 +876,6 @@ static int tonemap_opencl_filter_frame(A +@@ -363,9 +872,6 @@ static int tonemap_opencl_filter_frame(A if (err < 0) goto fail; @@ -1947,7 +1986,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c if (ctx->trc != -1) output->color_trc = ctx->trc; if (ctx->primaries != -1) -@@ -385,72 +895,92 @@ static int tonemap_opencl_filter_frame(A +@@ -385,72 +891,92 @@ static int tonemap_opencl_filter_frame(A ctx->range_out = output->color_range; ctx->chroma_loc = output->chroma_location; @@ -2080,7 +2119,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c av_frame_free(&input); av_frame_free(&output); return err; -@@ -458,24 +988,9 @@ fail: +@@ -458,24 +984,9 @@ fail: static av_cold void tonemap_opencl_uninit(AVFilterContext *avctx) { @@ -2107,7 +2146,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_opencl.c ff_opencl_filter_uninit(avctx); } -@@ -483,37 +998,50 @@ static av_cold void tonemap_opencl_unini +@@ -483,37 +994,50 @@ static av_cold void tonemap_opencl_unini #define OFFSET(x) offsetof(TonemapOpenCLContext, x) #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) static const AVOption tonemap_opencl_options[] = { diff --git a/debian/patches/0052-add-vf-tonemap-videotoolbox-filter.patch b/debian/patches/0052-add-vf-tonemap-videotoolbox-filter.patch index 6b0efa32cbc..064d2ce67e1 100644 --- a/debian/patches/0052-add-vf-tonemap-videotoolbox-filter.patch +++ b/debian/patches/0052-add-vf-tonemap-videotoolbox-filter.patch @@ -40,7 +40,7 @@ Index: FFmpeg/libavfilter/metal/vf_tonemap_videotoolbox.metal =================================================================== --- /dev/null +++ FFmpeg/libavfilter/metal/vf_tonemap_videotoolbox.metal -@@ -0,0 +1,891 @@ +@@ -0,0 +1,894 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -83,6 +83,10 @@ Index: FFmpeg/libavfilter/metal/vf_tonemap_videotoolbox.metal + +#define FLOAT_EPS 1e-6f + ++#define LIMITED_BLACK 0.06256109482f ++#define LIMITED_WHITE 0.9188660802f ++#define LIMITED_RANGE 0.8563049854f ++ +constant float ref_white [[function_constant(0)]]; +constant float tone_param [[function_constant(1)]]; +constant float desat_param [[function_constant(2)]]; @@ -270,18 +274,16 @@ Index: FFmpeg/libavfilter/metal/vf_tonemap_videotoolbox.metal +// ------------ +// Color conversion +float3 yuv2rgb(float y, float u, float v) { -+ if (is_full_range_in) { -+ u -= 0.5f; -+ v -= 0.5f; -+ } else { -+ y = (y * 255.0f - 16.0f) / 219.0f; -+ u = (u * 255.0f - 128.0f) / 224.0f; -+ v = (v * 255.0f - 128.0f) / 224.0f; -+ } ++ u -= 0.5f; ++ v -= 0.5f; + float r = (y * rgb_matrix_1[0]) + (u * rgb_matrix_1[1]) + (v * rgb_matrix_1[2]); + float g = (y * rgb_matrix_2[0]) + (u * rgb_matrix_2[1]) + (v * rgb_matrix_2[2]); + float b = (y * rgb_matrix_3[0]) + (u * rgb_matrix_3[1]) + (v * rgb_matrix_3[2]); -+ return float3(r, g, b); ++ float3 c = float3(r, g, b); ++ if (!is_full_range_in) { ++ c = (c - LIMITED_BLACK) / LIMITED_RANGE; ++ } ++ return c; +} + +float3 yuv2lrgb(float3 yuv) { @@ -296,25 +298,26 @@ Index: FFmpeg/libavfilter/metal/vf_tonemap_videotoolbox.metal +} + +float3 rgb2yuv(float r, float g, float b) { ++ if (!is_full_range_out) { ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; ++ } + float y = (r*yuv_matrix_1[0]) + (g*yuv_matrix_1[1]) + (b*yuv_matrix_1[2]); + float u = (r*yuv_matrix_2[0]) + (g*yuv_matrix_2[1]) + (b*yuv_matrix_2[2]); + float v = (r*yuv_matrix_3[0]) + (g*yuv_matrix_3[1]) + (b*yuv_matrix_3[2]); -+ if (is_full_range_out) { -+ u += 0.5f; -+ v += 0.5f; -+ } else { -+ y = (219.0f * y + 16.0f) / 255.0f; -+ u = (224.0f * u + 128.0f) / 255.0f; -+ v = (224.0f * v + 128.0f) / 255.0f; -+ } ++ u += 0.5f; ++ v += 0.5f; + return float3(y, u, v); +} + +float rgb2y(float r, float g, float b) { -+ float y = (r*yuv_matrix_1[0]) + (g*yuv_matrix_1[1]) + (b*yuv_matrix_1[2]); + if (!is_full_range_out) { -+ y = (219.0f * y + 16.0f) / 255.0f; ++ r = r * LIMITED_RANGE + LIMITED_BLACK; ++ g = g * LIMITED_RANGE + LIMITED_BLACK; ++ b = b * LIMITED_RANGE + LIMITED_BLACK; + } ++ float y = (r*yuv_matrix_1[0]) + (g*yuv_matrix_1[1]) + (b*yuv_matrix_1[2]); + return y; +} + @@ -936,7 +939,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_videotoolbox.m =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemap_videotoolbox.m -@@ -0,0 +1,1154 @@ +@@ -0,0 +1,1149 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -1026,7 +1029,6 @@ Index: FFmpeg/libavfilter/vf_tonemap_videotoolbox.m + enum TonemapMode tonemap_mode; + enum AVPixelFormat format; + int apply_dovi; -+ double ref_white; + double peak; + double target_peak; + double param; @@ -1263,7 +1265,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_videotoolbox.m + + MTLFunctionConstantValues* constant_values = [MTLFunctionConstantValues new]; + dispatch_data_t lib_data; -+ float ref_white; ++ float ref_white = REFERENCE_WHITE_ALT; + float tone_param; + float desat_param; + float target_peak; @@ -1315,10 +1317,7 @@ Index: FFmpeg/libavfilter/vf_tonemap_videotoolbox.m + if (isnan(ctx->param)) + ctx->param = 1.0f; + -+ ctx->ref_white = ctx->tonemap == TONEMAP_BT2390 ? REFERENCE_WHITE_ALT -+ : REFERENCE_WHITE; -+ -+ if (ctx->tonemap == TONEMAP_BT2390 && ctx->peak) ++ if (ctx->peak) + ctx->peak = FFMAX(ctx->peak / 10.0f, 1.1f); + + // SDR peak is 1.0f @@ -1403,12 +1402,11 @@ Index: FFmpeg/libavfilter/vf_tonemap_videotoolbox.m + } + + tone_param = (float)ctx->param; -+ ref_white = (float)ctx->ref_white; + desat_param = (float)ctx->desat_param; + target_peak = (float)ctx->target_peak; + scene_threshold = (float)ctx->scene_threshold; -+ pq_max_lum_div_ref_white = (float)(ST2084_MAX_LUMINANCE / ctx->ref_white); -+ ref_white_div_pq_max_lum = (float)(ctx->ref_white / ST2084_MAX_LUMINANCE); ++ pq_max_lum_div_ref_white = (float)(ST2084_MAX_LUMINANCE / ref_white); ++ ref_white_div_pq_max_lum = (float)(ref_white / ST2084_MAX_LUMINANCE); + tonemap_func_type = (short)ctx->tonemap; + is_tone_func_bt2390 = ctx->tonemap == TONEMAP_BT2390; + is_tone_mode_rgb = ctx->tonemap_mode == TONEMAP_MODE_RGB; diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 5419eb7616d..0daa46f9e10 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,2149 @@ +@@ -0,0 +1,2142 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -347,10 +347,8 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + float32x4_t b_linx4a; + float32x4_t b_linx4b; + float32x4_t offset = vdupq_n_f32(0.5f); -+ int32x4_t output_upper_bound = vdupq_n_s32(32767); ++ int32x4_t output_upper_bound = vdupq_n_s32(INT16_MAX); + int32x4_t zerox4 = vdupq_n_s32(0); -+ int16x8_t input_lut_offset = vdupq_n_s16(2048); -+ int16x8_t input_upper_bound = vdupq_n_s16(32767); + int16x8_t r, g, b; + int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; + @@ -360,21 +358,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + g = vreinterpretq_s16_u16(g_in); + b = vreinterpretq_s16_u16(b_in); + -+ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); -+ sig8 = vaddq_s16(sig8, input_lut_offset); -+ sig8 = vminq_s16(sig8, input_upper_bound); -+ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); -+ -+ r = vaddq_s16(r, input_lut_offset); -+ r = vminq_s16(r, input_upper_bound); + r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); -+ g = vaddq_s16(g, input_lut_offset); -+ g = vminq_s16(g, input_upper_bound); + g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); -+ b = vaddq_s16(b, input_lut_offset); -+ b = vminq_s16(b, input_upper_bound); + b = vmaxq_s16(b, vreinterpretq_s16_s32(zerox4)); + ++ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); ++ + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ +mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ @@ -462,12 +451,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); + b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); + -+ r_linx4a = vfmaq_n_f32(offset, r_linx4a, 32767); -+ r_linx4b = vfmaq_n_f32(offset, r_linx4b, 32767); -+ g_linx4a = vfmaq_n_f32(offset, g_linx4a, 32767); -+ g_linx4b = vfmaq_n_f32(offset, g_linx4b, 32767); -+ b_linx4a = vfmaq_n_f32(offset, b_linx4a, 32767); -+ b_linx4b = vfmaq_n_f32(offset, b_linx4b, 32767); ++ r_linx4a = vfmaq_n_f32(offset, r_linx4a, INT16_MAX); ++ r_linx4b = vfmaq_n_f32(offset, r_linx4b, INT16_MAX); ++ g_linx4a = vfmaq_n_f32(offset, g_linx4a, INT16_MAX); ++ g_linx4b = vfmaq_n_f32(offset, g_linx4b, INT16_MAX); ++ b_linx4a = vfmaq_n_f32(offset, b_linx4a, INT16_MAX); ++ b_linx4b = vfmaq_n_f32(offset, b_linx4b, INT16_MAX); + + rx4a = vcvtq_s32_f32(r_linx4a); + rx4a = vminq_s32(rx4a, output_upper_bound); @@ -640,9 +629,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = vmulq_n_f32(rx4a, 28672.0f); -+ gx4a = vmulq_n_f32(gx4a, 28672.0f); -+ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); + + // Reshape y0x4b + ia1 = vzip1q_f32(y0x4b, ux4b); @@ -671,14 +660,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = vmulq_n_f32(rx4b, 28672.0f); -+ gx4b = vmulq_n_f32(gx4b, 28672.0f); -+ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); + + r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); + g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); + b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); + ++ r0x8 = vminq_u16(r0x8, vdupq_n_u16(INT16_MAX)); ++ g0x8 = vminq_u16(g0x8, vdupq_n_u16(INT16_MAX)); ++ b0x8 = vminq_u16(b0x8, vdupq_n_u16(INT16_MAX)); ++ + // Reshape y1x4a + ia1 = vzip1q_f32(y1x4a, ux4a); + ia2 = vzip2q_f32(y1x4a, ux4a); @@ -706,9 +699,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = vmulq_n_f32(rx4a, 28672.0f); -+ gx4a = vmulq_n_f32(gx4a, 28672.0f); -+ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); + + // Reshape y1x4b + ia1 = vzip1q_f32(y1x4b, ux4b); @@ -737,14 +730,19 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = vmulq_n_f32(rx4b, 28672.0f); -+ gx4b = vmulq_n_f32(gx4b, 28672.0f); -+ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); + + r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); + g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); + b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); + ++ r1x8 = vminq_u16(r1x8, vdupq_n_u16(INT16_MAX)); ++ g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); ++ b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); ++ ++ + tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, @@ -1118,12 +1116,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint8_t *rdsty = dsty; @@ -1202,13 +1200,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); + uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + uvx8 = vsubq_u16(uvx8, in_uv_offx8); @@ -1364,11 +1360,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} @@ -1497,9 +1493,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = vmulq_n_f32(rx4a, 28672.0f); -+ gx4a = vmulq_n_f32(gx4a, 28672.0f); -+ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); + + // Reshape y0x4b + ia1 = vzip1q_f32(y0x4b, ux4b); @@ -1528,13 +1524,16 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = vmulq_n_f32(rx4b, 28672.0f); -+ gx4b = vmulq_n_f32(gx4b, 28672.0f); -+ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); + + r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); + g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); + b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r0x8 = vminq_u16(r0x8, vdupq_n_u16(INT16_MAX)); ++ g0x8 = vminq_u16(g0x8, vdupq_n_u16(INT16_MAX)); ++ b0x8 = vminq_u16(b0x8, vdupq_n_u16(INT16_MAX)); + + // Reshape y1x4a + ia1 = vzip1q_f32(y1x4a, ux4a); @@ -1563,9 +1562,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = vmulq_n_f32(rx4a, 28672.0f); -+ gx4a = vmulq_n_f32(gx4a, 28672.0f); -+ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ rx4a = vmulq_n_f32(rx4a, JPEG_SCALE); ++ gx4a = vmulq_n_f32(gx4a, JPEG_SCALE); ++ bx4a = vmulq_n_f32(bx4a, JPEG_SCALE); + + // Reshape y1x4b + ia1 = vzip1q_f32(y1x4b, ux4b); @@ -1594,13 +1593,16 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = vmulq_n_f32(rx4b, 28672.0f); -+ gx4b = vmulq_n_f32(gx4b, 28672.0f); -+ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ rx4b = vmulq_n_f32(rx4b, JPEG_SCALE); ++ gx4b = vmulq_n_f32(gx4b, JPEG_SCALE); ++ bx4b = vmulq_n_f32(bx4b, JPEG_SCALE); + + r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); + g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); + b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ r1x8 = vminq_u16(r1x8, vdupq_n_u16(INT16_MAX)); ++ g1x8 = vminq_u16(g1x8, vdupq_n_u16(INT16_MAX)); ++ b1x8 = vminq_u16(b1x8, vdupq_n_u16(INT16_MAX)); + + tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -1973,12 +1975,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; @@ -2047,7 +2049,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); ++ int32x4_t out_sh2x4 = vdupq_n_s32(out_sh2); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, @@ -2188,45 +2190,36 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); + -+ switch(out_depth) { -+ default: -+ case 10: -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ uox4 = vshrq_n_s32(uox4, 19); -+ vox4 = vshrq_n_s32(vox4, 19); -+ break; -+ case 16: -+ y0oax4 = vshrq_n_s32(y0oax4, 13); -+ y0obx4 = vshrq_n_s32(y0obx4, 13); -+ y1oax4 = vshrq_n_s32(y1oax4, 13); -+ y1obx4 = vshrq_n_s32(y1obx4, 13); -+ uox4 = vshrq_n_s32(uox4, 13); -+ vox4 = vshrq_n_s32(vox4, 13); -+ break; -+ } ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ uox4 = vshrq_n_s32(uox4, 19); ++ vox4 = vshrq_n_s32(vox4, 19); + + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0oax4 = vshlq_s32(y0oax4, out_sh2x4); + y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y0obx4 = vshlq_s32(y0obx4, out_sh2x4); + y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1oax4 = vshlq_s32(y1oax4, out_sh2x4); + y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ y1obx4 = vshlq_s32(y1obx4, out_sh2x4); + uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ uox4 = vshlq_s32(uox4, out_sh2x4); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vox4 = vshlq_s32(vox4, out_sh2x4); + + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); + vst1q_u16(&dsty[x], y0ox8); + + y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); + vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + + uvoax4 = vzip1q_s32(uox4, vox4); + uvobx4 = vzip2q_s32(uox4, vox4); + -+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); ++ vst1q_u16(&dstuv[x], vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4))); + } + } + @@ -2237,11 +2230,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} @@ -2289,12 +2282,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + int width, int height, + const struct TonemapIntParams *params); + -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++void tonemap_frame_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +void tonemap_frame_dovi_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, @@ -2310,12 +2303,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + int width, int height, + const struct TonemapIntParams *params); + -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++void tonemap_frame_p010_2_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +#endif // AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H Index: FFmpeg/libavfilter/allfilters.c @@ -2342,7 +2335,7 @@ Index: FFmpeg/libavfilter/colorspace.c #include "libavutil/frame.h" #include "libavutil/mastering_display_metadata.h" #include "libavutil/pixdesc.h" -@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { +@@ -354,3 +355,53 @@ float inverse_eotf_arib_b67(float x) { float inverse_eotf_bt1886(float x) { return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; } @@ -2369,15 +2362,17 @@ Index: FFmpeg/libavfilter/colorspace.c +} + +void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb) ++ int depth, int y_rng, int uv_rng, int yuv2rgb, int is_full_range) +{ +#define N (yuv2rgb ? m : n) +#define M (yuv2rgb ? n : m) + int rng, n, m, o; ++ int range_scale = is_full_range ? 32767 : 28032; ++ int range_scale_uv = is_full_range ? 32767 : 28672; + int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); -+ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { ++ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng, range_scale = range_scale_uv) { + for (m = 0; m < 3; m++) { -+ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672)); ++ out[N][M][0] = (int16_t)lrint(bits * (yuv2rgb ? range_scale : rng) * table[N][M] / (yuv2rgb ? rng : range_scale)); + for (o = 1; o < 8; o++) + out[N][M][o] = out[N][M][0]; + } @@ -2405,13 +2400,13 @@ Index: FFmpeg/libavfilter/colorspace.h +int ff_get_range_off(int *off, int *y_rng, int *uv_rng, + enum AVColorRange rng, int depth); +void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb); ++ int depth, int y_rng, int uv_rng, int yuv2rgb, int is_full_range); #endif Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1778 @@ +@@ -0,0 +1,1791 @@ +/* + * This file is part of FFmpeg. + * @@ -2470,6 +2465,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#include "internal.h" +#include "video.h" + ++#define MIX(x, y, a) ((x) + ((y) - (x)) * (a)) ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++ +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, @@ -2569,7 +2567,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +static const enum AVPixelFormat in_pix_fmts[] = { + AV_PIX_FMT_YUV420P10, + AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, + AV_PIX_FMT_NONE, +}; + @@ -2578,15 +2575,14 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + AV_PIX_FMT_YUV420P10, + AV_PIX_FMT_NV12, + AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, +}; + +const double dovi_lms2rgb_matrix[3][3] = -+ { -+ { 3.06441879, -2.16597676, 0.10155818}, -+ {-0.65612108, 1.78554118, -0.12943749}, -+ { 0.01736321, -0.04725154, 1.03004253}, -+ }; ++{ ++ { 3.06441879, -2.16597676, 0.10155818}, ++ {-0.65612108, 1.78554118, -0.12943749}, ++ { 0.01736321, -0.04725154, 1.03004253}, ++}; + +static void update_dovi_buf(AVFilterContext *ctx) +{ @@ -2730,7 +2726,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2]; +} + -+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) +inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx) +{ + int i; @@ -2771,41 +2766,40 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + coeffs[2] = dovi_coeffs[0*4+2]; + coeffs[3] = dovi_coeffs[0*4+3]; + -+#define mix(x, y, a) ((x) + ((y) - (x)) * (a)) + if (i == 0 && dovi_num_pivots > 2) { + int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1]; + int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3]; + int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6]; + -+ float m01[4] = { mix(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), -+ mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), -+ mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), -+ mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; -+ float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), -+ mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), -+ mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), -+ mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; -+ float m0123[4] = { mix(m01[0], m23[0], t1), -+ mix(m01[1], m23[1], t1), -+ mix(m01[2], m23[2], t1), -+ mix(m01[3], m23[3], t1) }; -+ float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), -+ mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), -+ mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), -+ mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; -+ float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), -+ mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), -+ mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), -+ mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; -+ float m4567[4] = { mix(m45[0], m67[0], t5), -+ mix(m45[1], m67[1], t5), -+ mix(m45[2], m67[2], t5), -+ mix(m45[3], m67[3], t5) }; -+ -+ coeffs[0] = mix(m0123[0], m4567[0], t3); -+ coeffs[1] = mix(m0123[1], m4567[1], t3); -+ coeffs[2] = mix(m0123[2], m4567[2], t3); -+ coeffs[3] = mix(m0123[3], m4567[3], t3); ++ float m01[4] = { MIX(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), ++ MIX(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), ++ MIX(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), ++ MIX(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; ++ float m23[4] = { MIX(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), ++ MIX(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), ++ MIX(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), ++ MIX(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; ++ float m0123[4] = { MIX(m01[0], m23[0], t1), ++ MIX(m01[1], m23[1], t1), ++ MIX(m01[2], m23[2], t1), ++ MIX(m01[3], m23[3], t1) }; ++ float m45[4] = { MIX(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), ++ MIX(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), ++ MIX(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), ++ MIX(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; ++ float m67[4] = { MIX(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), ++ MIX(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), ++ MIX(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), ++ MIX(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; ++ float m4567[4] = { MIX(m45[0], m67[0], t5), ++ MIX(m45[1], m67[1], t5), ++ MIX(m45[2], m67[2], t5), ++ MIX(m45[3], m67[3], t5) }; ++ ++ coeffs[0] = MIX(m0123[0], m4567[0], t3); ++ coeffs[1] = MIX(m0123[1], m4567[1], t3); ++ coeffs[2] = MIX(m0123[2], m4567[2], t3); ++ coeffs[3] = MIX(m0123[3], m4567[3], t3); + } + + has_mmr_poly = dovi_has_mmr && dovi_has_poly; @@ -2856,21 +2850,21 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +static float bt2390(float s, float peak) +{ + float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); -+ float scale = 1.0f / peak_pq; ++ float scale = peak_pq > 0.0f ? (1.0f / peak_pq) : 1.0f; + + // SDR peak + float dst_peak = 1.0f; + float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; -+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; ++ float max_lum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; + -+ float ks = 1.5f * maxLum - 0.5f; ++ float ks = 1.5f * max_lum - 0.5f; + float tb = (s_pq - ks) / (1.0f - ks); + float tb2 = tb * tb; + float tb3 = tb2 * tb; + float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + + (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + -+ (-2.0f * tb3 + 3.0f * tb2) * maxLum; -+ float sig = (s_pq < ks) ? s_pq : pb; ++ (-2.0f * tb3 + 3.0f * tb2) * max_lum; ++ float sig = MIX(pb, s_pq, s_pq < ks); + + return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); +} @@ -2939,10 +2933,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return AVERROR(ENOMEM); + + for (i = 0; i < 32768; i++) { -+ double v1 = (i - 2048.0f) / 28672.0f; -+ double v2 = i / 32767.0f; -+ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); -+ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); ++ float v = (float)i / JPEG_SCALE; ++ s->lin_lut[i] = FFMAX(linearize(v, trc_src), 0); ++ s->delin_lut[i] = av_clip_int16((int)rintf(delinearize(v, trc_dst) * JPEG_SCALE)); + } + + return 0; @@ -2957,8 +2950,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return AVERROR(ENOMEM); + + for (i = 0; i < 32768; i++) { -+ double v = (i - 2048.0f) / 28672.0f; -+ double sig = linearize(v, trc_src); ++ float v = (float)i / JPEG_SCALE; ++ float sig = linearize(v, trc_src); + float mapped = mapsig(s->tonemap, sig, peak, s->param); + s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? mapped / sig : 0.0f; + } @@ -2992,7 +2985,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); + + ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, -+ y_rng, uv_rng, 1); ++ y_rng, uv_rng, 1, irng == AVCOL_RANGE_JPEG); + + res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, + orng, odesc->comp[0].depth); @@ -3004,7 +2997,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } + + ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, -+ y_rng, uv_rng, 0); ++ y_rng, uv_rng, 0, orng == AVCOL_RANGE_JPEG); + + return 0; +} @@ -3069,33 +3062,38 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + lms2rgb(c3, c3[0], c3[1], c3[2], params->dovi->linear, *params->lms2rgb_matrix); + lms2rgb(c4, c4[0], c4[1], c4[2], params->dovi->linear, *params->lms2rgb_matrix); + -+ r[0] = av_clip_int16(c1[0] * 28672); -+ r[1] = av_clip_int16(c2[0] * 28672); -+ r[2] = av_clip_int16(c3[0] * 28672); -+ r[3] = av_clip_int16(c4[0] * 28672); -+ -+ g[0] = av_clip_int16(c1[1] * 28672); -+ g[1] = av_clip_int16(c2[1] * 28672); -+ g[2] = av_clip_int16(c3[1] * 28672); -+ g[3] = av_clip_int16(c4[1] * 28672); -+ -+ b[0] = av_clip_int16(c1[2] * 28672); -+ b[1] = av_clip_int16(c2[2] * 28672); -+ b[2] = av_clip_int16(c3[2] * 28672); -+ b[3] = av_clip_int16(c4[2] * 28672); ++ // DoVi always uses full range ++ r[0] = av_clip_uintp2((int)(c1[0] * JPEG_SCALE), 15); ++ r[1] = av_clip_uintp2((int)(c2[0] * JPEG_SCALE), 15); ++ r[2] = av_clip_uintp2((int)(c3[0] * JPEG_SCALE), 15); ++ r[3] = av_clip_uintp2((int)(c4[0] * JPEG_SCALE), 15); ++ ++ g[0] = av_clip_uintp2((int)(c1[1] * JPEG_SCALE), 15); ++ g[1] = av_clip_uintp2((int)(c2[1] * JPEG_SCALE), 15); ++ g[2] = av_clip_uintp2((int)(c3[1] * JPEG_SCALE), 15); ++ g[3] = av_clip_uintp2((int)(c4[1] * JPEG_SCALE), 15); ++ ++ b[0] = av_clip_uintp2((int)(c1[2] * JPEG_SCALE), 15); ++ b[1] = av_clip_uintp2((int)(c2[2] * JPEG_SCALE), 15); ++ b[2] = av_clip_uintp2((int)(c3[2] * JPEG_SCALE), 15); ++ b[3] = av_clip_uintp2((int)(c4[2] * JPEG_SCALE), 15); +} + +inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) +{ + int16_t sig; + float mapval, r_lin, g_lin, b_lin; + ++ r_in = av_clip_uintp2(r_in, 15); ++ g_in = av_clip_uintp2(g_in, 15); ++ b_in = av_clip_uintp2(b_in, 15); ++ + /* load values */ + *r_out = r_in; + *g_out = g_in; @@ -3106,11 +3104,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + * out-of-bounds clipping */ + sig = FFMAX3(r_in, g_in, b_in); + -+ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ mapval = tonemap_lut[sig]; + -+ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; -+ g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)]; -+ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ r_lin = lin_lut[r_in]; ++ g_lin = lin_lut[g_in]; ++ b_lin = lin_lut[b_in]; + + if (!rgb2rgb_passthrough) { + r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; @@ -3118,7 +3116,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; + } + -+#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) + /* desaturate to prevent unnatural colors */ + if (desat > 0) { + float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; @@ -3131,7 +3128,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + r_lin *= mapval; + g_lin *= mapval; + b_lin *= mapval; -+#undef MIX + + *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; + *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; @@ -3139,12 +3135,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +// See also libavfilter/colorspacedsp_template.c -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -3235,6 +3231,103 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + ++void tonemap_frame_p010_2_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++} ++ +void tonemap_frame_dovi_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, @@ -3573,103 +3666,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; -+ -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; -+ -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; -+ -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); -+ -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); -+ -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; -+ -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } -+ } -+} -+ +#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ +ThreadData *td = arg; \ +AVFrame *in = td->in; \ @@ -3695,8 +3691,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +.desat = s->desat, \ +.dovi = s->dovi, \ +.dovi_pbuf = s->dovi_pbuf, \ -+.lms2rgb_matrix = &s->lms2rgb_matrix, \ -+.ycc_offset = &s->ycc_offset \ ++.lms2rgb_matrix = &s->lms2rgb_matrix, \ ++.ycc_offset = &s->ycc_offset \ +}; + +static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) @@ -3780,6 +3776,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + TonemapxContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + AVFrame *out; ++ AVFrameSideData *dovi_sd = NULL; + const AVPixFmtDescriptor *desc; + const AVPixFmtDescriptor *odesc; + int ret; @@ -3821,10 +3818,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if ((ret = av_frame_copy_props(out, in)) < 0) + goto fail; + ++ if (s->apply_dovi) { ++ dovi_sd = av_frame_get_side_data(in, AV_FRAME_DATA_DOVI_METADATA); ++ } ++ + /* read peak from side data if not passed in */ + if (!peak) { -+ peak = ff_determine_signal_peak(in); -+ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); ++ if (dovi_sd) { ++ const AVDOVIMetadata *metadata = (AVDOVIMetadata *) dovi_sd->data; ++ s->peak = ff_determine_dovi_signal_peak(metadata); ++ } else { ++ s->peak = ff_determine_signal_peak(in); ++ } + } + + out->color_trc = s->trc == -1 ? AVCOL_TRC_UNSPECIFIED : s->trc; @@ -3852,6 +3857,40 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if (out->color_range == AVCOL_RANGE_UNSPECIFIED) + out->color_range = AVCOL_RANGE_MPEG; + ++ if (dovi_sd) { ++ const AVDOVIMetadata *metadata = (AVDOVIMetadata *) dovi_sd->data; ++ const AVDOVIRpuDataHeader *rpu = av_dovi_get_header(metadata); ++ // only map dovi rpus that don't require an EL and has rpu profile == 0 ++ // for performance reason we only want to do reshaping when absolutely needed ++ // such videos usually have vdr_rpu_profile == 0, for example profile 5 videos ++ // this could be wrong as there is no public documentation on this field ++ if (rpu->disable_residual_flag && rpu->vdr_rpu_profile == 0) { ++ struct DoviMetadata *dovi = av_malloc(sizeof(*dovi)); ++ s->dovi = dovi; ++ if (!s->dovi) ++ goto fail; ++ ++ ff_map_dovi_metadata(s->dovi, metadata); ++ in->color_trc = AVCOL_TRC_SMPTE2084; ++ in->colorspace = AVCOL_SPC_BT2020_NCL; ++ in->color_primaries = AVCOL_PRI_BT2020; ++ } ++ } ++ ++ if (s->dovi) { ++ if (desc->comp[2].plane == 1) { ++ av_log(s, AV_LOG_ERROR, "Input pixel format has to be yuv420p10 for Dolby Vision reshaping\n"); ++ av_assert0(0); ++ } ++ update_dovi_buf(ctx); ++ ff_matrix_mul_3x3(s->lms2rgb_matrix, dovi_lms2rgb_matrix, s->dovi->linear); ++ s->ycc_offset[0] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[0][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[0][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[0][2]; ++ s->ycc_offset[1] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[1][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[1][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[1][2]; ++ s->ycc_offset[2] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[2][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[2][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[2][2]; ++ s->tonemap_func_planar8 = s->tonemap_func_dovi8; ++ s->tonemap_func_planar10 = s->tonemap_func_dovi10; ++ } ++ + if (!s->lin_lut || !s->delin_lut) { + if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) + goto fail; @@ -3859,7 +3898,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + if (!s->tonemap_lut || s->lut_peak != peak) { + s->lut_peak = peak; -+ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) ++ if ((ret = compute_tonemap_lut(s, in->color_trc)) < 0) + goto fail; + } + @@ -3874,40 +3913,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + goto fail; + } + -+ if (s->apply_dovi) { -+ AVFrameSideData *dovi_sd = av_frame_get_side_data(in, AV_FRAME_DATA_DOVI_METADATA); -+ if (dovi_sd) { -+ const AVDOVIMetadata *metadata = (AVDOVIMetadata *) dovi_sd->data; -+ const AVDOVIRpuDataHeader *rpu = av_dovi_get_header(metadata); -+ // only map dovi rpus that don't require an EL and has rpu profile == 0 -+ // for performance reason we only want to do reshaping when absolutely needed -+ // such videos usually have vdr_rpu_profile == 0, for example profile 5 videos -+ // this could be wrong as there is no public documentation on this field -+ if (rpu->disable_residual_flag && rpu->vdr_rpu_profile == 0) { -+ struct DoviMetadata *dovi = av_malloc(sizeof(*dovi)); -+ s->dovi = dovi; -+ if (!s->dovi) -+ goto fail; -+ -+ ff_map_dovi_metadata(s->dovi, metadata); -+ } -+ } -+ -+ if (s->dovi) { -+ if (desc->comp[2].plane == 1) { -+ av_log(s, AV_LOG_ERROR, "Input pixel format has to be yuv420p10 for Dolby Vision reshaping\n"); -+ av_assert0(0); -+ } -+ update_dovi_buf(ctx); -+ ff_matrix_mul_3x3(s->lms2rgb_matrix, dovi_lms2rgb_matrix, s->dovi->linear); -+ s->ycc_offset[0] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[0][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[0][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[0][2]; -+ s->ycc_offset[1] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[1][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[1][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[1][2]; -+ s->ycc_offset[2] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[2][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[2][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[2][2]; -+ s->tonemap_func_planar8 = s->tonemap_func_dovi8; -+ s->tonemap_func_planar10 = s->tonemap_func_dovi10; -+ } -+ } -+ + /* do the tonemap */ + td.in = in; + td.out = out; @@ -4016,8 +4021,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; ++ s->tonemap_func_biplanar8 = tonemap_frame_p010_2_nv12_neon; ++ s->tonemap_func_biplanar10 = tonemap_frame_p010_2_p010_neon; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_neon; @@ -4033,8 +4038,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { + int cpu_flags = av_get_cpu_flags(); + if (X86_SSE42(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_biplanar8 = tonemap_frame_p010_2_nv12_sse; ++ s->tonemap_func_biplanar10 = tonemap_frame_p010_2_p010_sse; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_sse; @@ -4049,8 +4054,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { + int cpu_flags = av_get_cpu_flags(); + if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; ++ s->tonemap_func_biplanar8 = tonemap_frame_p010_2_nv12_avx; ++ s->tonemap_func_biplanar10 = tonemap_frame_p010_2_p010_avx; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; + s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_avx; @@ -4070,11 +4075,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#endif + + if (!s->tonemap_func_biplanar8) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++ s->tonemap_func_biplanar8 = tonemap_frame_p010_2_nv12; + } + + if (!s->tonemap_func_biplanar10) { -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; ++ s->tonemap_func_biplanar10 = tonemap_frame_p010_2_p010; + } + + if (!s->tonemap_func_planar8) { @@ -4127,6 +4132,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if (isnan(s->param)) + s->param = 1.0f; + ++ if (s->peak) ++ s->peak = FFMAX(s->peak / 10.0f, 1.1f); ++ + return 0; +} + @@ -4194,7 +4202,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,126 @@ +@@ -0,0 +1,127 @@ +/* + * This file is part of FFmpeg. + * @@ -4252,6 +4260,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h +#define pivots_sz pivots_cnt*sizeof(float) +#define coeffs_sz coeffs_cnt*sizeof(float) +#define mmr_sz mmr_cnt*sizeof(float) ++#define JPEG_SCALE 32767.0f + +typedef struct TonemapIntParams { + double lut_peak; @@ -4292,12 +4301,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + int width, int height, + const struct TonemapIntParams *params); + -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++void tonemap_frame_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +void tonemap_frame_dovi_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, @@ -4313,12 +4322,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + int width, int height, + const struct TonemapIntParams *params); + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++void tonemap_frame_p010_2_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +#endif // AVFILTER_TONEMAPX_H Index: FFmpeg/libavfilter/x86/Makefile @@ -4338,7 +4347,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,2276 @@ +@@ -0,0 +1,2293 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -4672,27 +4681,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256 mapvalx8, r_linx8, g_linx8, b_linx8; + __m256 offset = _mm256_set1_ps(0.5f); + __m256i zerox8 = _mm256_setzero_si256(); -+ __m256i input_lut_offset = _mm256_set1_epi32(2048); + __m256i upper_bound = _mm256_set1_epi32(32767); + __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); + __m256i r, g, b, rx8, gx8, bx8; + + float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; + -+ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); -+ sig8 = _mm256_add_epi32(sig8, input_lut_offset); -+ sig8 = _mm256_min_epi32(sig8, upper_bound); -+ sig8 = _mm256_max_epi32(sig8, zerox8); -+ -+ r = _mm256_add_epi32(r_in, input_lut_offset); -+ r = _mm256_min_epi32(r, upper_bound); -+ r = _mm256_max_epi32(r, zerox8); -+ g = _mm256_add_epi32(g_in, input_lut_offset); -+ g = _mm256_min_epi32(g, upper_bound); -+ g = _mm256_max_epi32(g, zerox8); -+ b = _mm256_add_epi32(b_in, input_lut_offset); -+ b = _mm256_min_epi32(b, upper_bound); -+ b = _mm256_max_epi32(b, zerox8); ++ r = _mm256_max_epi32(r_in, zerox8); ++ g = _mm256_max_epi32(g_in, zerox8); ++ b = _mm256_max_epi32(b_in, zerox8); ++ ++ sig8 = _mm256_max_epi32(r, _mm256_max_epi32(g, b)); + +#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ +r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ @@ -4896,13 +4895,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); -+ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); -+ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); + + r0x8a = _mm256_cvtps_epi32(rx8a); ++ r0x8a = av_clip_int16_avx(r0x8a); + g0x8a = _mm256_cvtps_epi32(gx8a); ++ g0x8a = av_clip_int16_avx(g0x8a); + b0x8a = _mm256_cvtps_epi32(bx8a); ++ b0x8a = av_clip_int16_avx(b0x8a); + + // Reshape y1x8a + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -4916,13 +4918,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); -+ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); -+ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); + + r1x8a = _mm256_cvtps_epi32(rx8a); ++ r1x8a = av_clip_int16_avx(r1x8a); + g1x8a = _mm256_cvtps_epi32(gx8a); ++ g1x8a = av_clip_int16_avx(g1x8a); + b1x8a = _mm256_cvtps_epi32(bx8a); ++ b1x8a = av_clip_int16_avx(b1x8a); + + // Reshape y0x8b + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -4936,13 +4941,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); -+ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); -+ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + + r0x8b = _mm256_cvtps_epi32(rx8b); ++ r0x8b = av_clip_int16_avx(r0x8b); + g0x8b = _mm256_cvtps_epi32(gx8b); ++ g0x8b = av_clip_int16_avx(g0x8b); + b0x8b = _mm256_cvtps_epi32(bx8b); ++ b0x8b = av_clip_int16_avx(b0x8b); + + // Reshape y1x8b + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -4956,13 +4964,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); -+ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); -+ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + + r1x8b = _mm256_cvtps_epi32(rx8b); ++ r1x8b = av_clip_int16_avx(r1x8b); + g1x8b = _mm256_cvtps_epi32(gx8b); ++ g1x8b = av_clip_int16_avx(g1x8b); + b1x8b = _mm256_cvtps_epi32(bx8b); ++ b1x8b = av_clip_int16_avx(b1x8b); + + tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -5211,13 +5222,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); -+ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); -+ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); + + r0x8a = _mm256_cvtps_epi32(rx8a); ++ r0x8a = av_clip_int16_avx(r0x8a); + g0x8a = _mm256_cvtps_epi32(gx8a); ++ g0x8a = av_clip_int16_avx(g0x8a); + b0x8a = _mm256_cvtps_epi32(bx8a); ++ b0x8a = av_clip_int16_avx(b0x8a); + + // Reshape y1x8a + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -5231,13 +5245,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); -+ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); -+ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(JPEG_SCALE)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(JPEG_SCALE)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(JPEG_SCALE)); + + r1x8a = _mm256_cvtps_epi32(rx8a); ++ r1x8a = av_clip_int16_avx(r1x8a); + g1x8a = _mm256_cvtps_epi32(gx8a); ++ g1x8a = av_clip_int16_avx(g1x8a); + b1x8a = _mm256_cvtps_epi32(bx8a); ++ b1x8a = av_clip_int16_avx(b1x8a); + + // Reshape y0x8b + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -5251,13 +5268,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); -+ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); -+ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + + r0x8b = _mm256_cvtps_epi32(rx8b); ++ r0x8b = av_clip_int16_avx(r0x8b); + g0x8b = _mm256_cvtps_epi32(gx8b); ++ g0x8b = av_clip_int16_avx(g0x8b); + b0x8b = _mm256_cvtps_epi32(bx8b); ++ b0x8b = av_clip_int16_avx(b0x8b); + + // Reshape y1x8b + reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, @@ -5271,13 +5291,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); + -+ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); -+ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); -+ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(JPEG_SCALE)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(JPEG_SCALE)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(JPEG_SCALE)); + + r1x8b = _mm256_cvtps_epi32(rx8b); ++ r1x8b = av_clip_int16_avx(r1x8b); + g1x8b = _mm256_cvtps_epi32(gx8b); ++ g1x8b = av_clip_int16_avx(g1x8b); + b1x8b = _mm256_cvtps_epi32(bx8b); ++ b1x8b = av_clip_int16_avx(b1x8b); + + tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -6011,12 +6034,12 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint8_t *rdsty = dsty; @@ -6303,21 +6326,21 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint16_t *rdsty = dsty; @@ -6518,6 +6541,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); + yoax8 = _mm256_srai_epi32(yoax8, out_sh); + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); + + yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); + yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); @@ -6525,10 +6549,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); + + y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); @@ -6549,6 +6573,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); + y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); + + y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); @@ -6556,10 +6581,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); + + y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); @@ -6594,8 +6619,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); + uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); ++ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); + uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } @@ -6607,11 +6633,11 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} @@ -6673,26 +6699,26 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++X86_64_V3 void tonemap_frame_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++X86_64_V3 void tonemap_frame_p010_2_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,2353 @@ +@@ -0,0 +1,2374 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -6983,22 +7009,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128i sig4; + __m128 mapvalx4, r_linx4, g_linx4, b_linx4; + __m128 offset = _mm_set1_ps(0.5f); -+ __m128i input_lut_offset = _mm_set1_epi32(2048); -+ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); ++ __m128 intermediate_upper_bound = _mm_set1_ps(JPEG_SCALE); + __m128i r, g, b, rx4, gx4, bx4; + + float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; + -+ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); -+ sig4 = _mm_add_epi32(sig4, input_lut_offset); -+ sig4 = av_clip_uint16_sse(sig4); ++ r = av_clip_uint16_sse(r_in); ++ g = av_clip_uint16_sse(g_in); ++ b = av_clip_uint16_sse(b_in); + -+ r = _mm_add_epi32(r_in, input_lut_offset); -+ r = av_clip_uint16_sse(r); -+ g = _mm_add_epi32(g_in, input_lut_offset); -+ g = av_clip_uint16_sse(g); -+ b = _mm_add_epi32(b_in, input_lut_offset); -+ b = av_clip_uint16_sse(b); ++ sig4 = _mm_max_epi32(r, _mm_max_epi32(g, b)); + + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ @@ -7221,13 +7241,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); -+ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); -+ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + + r0x4a = _mm_cvtps_epi32(rx4a); ++ r0x4a = av_clip_int16_sse(r0x4a); + g0x4a = _mm_cvtps_epi32(gx4a); ++ g0x4a = av_clip_int16_sse(g0x4a); + b0x4a = _mm_cvtps_epi32(bx4a); ++ b0x4a = av_clip_int16_sse(b0x4a); + + // Reshape y1x4a + ia1 = _mm_unpacklo_ps(y1x4af, ux4af); @@ -7261,13 +7284,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); -+ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); -+ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + + r1x4a = _mm_cvtps_epi32(rx4a); ++ r1x4a = av_clip_int16_sse(r1x4a); + g1x4a = _mm_cvtps_epi32(gx4a); ++ g1x4a = av_clip_int16_sse(g1x4a); + b1x4a = _mm_cvtps_epi32(bx4a); ++ b1x4a = av_clip_int16_sse(b1x4a); + + // Reshape y0x4b + ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); @@ -7301,13 +7327,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); -+ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); -+ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + + r0x4b = _mm_cvtps_epi32(rx4b); ++ r0x4b = av_clip_int16_sse(r0x4b); + g0x4b = _mm_cvtps_epi32(gx4b); ++ g0x4b = av_clip_int16_sse(g0x4b); + b0x4b = _mm_cvtps_epi32(bx4b); ++ b0x4b = av_clip_int16_sse(b0x4b); + + // Reshape y1x4b + ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); @@ -7341,13 +7370,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); -+ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); -+ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + + r1x4b = _mm_cvtps_epi32(rx4b); ++ r1x4b = av_clip_int16_sse(r1x4b); + g1x4b = _mm_cvtps_epi32(gx4b); ++ g1x4b = av_clip_int16_sse(g1x4b); + b1x4b = _mm_cvtps_epi32(bx4b); ++ b1x4b = av_clip_int16_sse(b1x4b); + + tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -7610,13 +7642,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); -+ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); -+ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + + r0x4a = _mm_cvtps_epi32(rx4a); ++ r0x4a = av_clip_int16_sse(r0x4a); + g0x4a = _mm_cvtps_epi32(gx4a); ++ g0x4a = av_clip_int16_sse(g0x4a); + b0x4a = _mm_cvtps_epi32(bx4a); ++ b0x4a = av_clip_int16_sse(b0x4a); + + // Reshape y1x4a + ia1 = _mm_unpacklo_ps(y1x4af, ux4af); @@ -7650,13 +7685,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); -+ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); -+ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(JPEG_SCALE)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(JPEG_SCALE)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(JPEG_SCALE)); + + r1x4a = _mm_cvtps_epi32(rx4a); ++ r1x4a = av_clip_int16_sse(r1x4a); + g1x4a = _mm_cvtps_epi32(gx4a); ++ g1x4a = av_clip_int16_sse(g1x4a); + b1x4a = _mm_cvtps_epi32(bx4a); ++ b1x4a = av_clip_int16_sse(b1x4a); + + // Reshape y0x4b + ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); @@ -7690,13 +7728,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); -+ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); -+ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + + r0x4b = _mm_cvtps_epi32(rx4b); ++ r0x4b = av_clip_int16_sse(r0x4b); + g0x4b = _mm_cvtps_epi32(gx4b); ++ g0x4b = av_clip_int16_sse(g0x4b); + b0x4b = _mm_cvtps_epi32(bx4b); ++ b0x4b = av_clip_int16_sse(b0x4b); + + // Reshape y1x4b + ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); @@ -7730,13 +7771,16 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); + lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); + -+ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); -+ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); -+ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(JPEG_SCALE)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(JPEG_SCALE)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(JPEG_SCALE)); + + r1x4b = _mm_cvtps_epi32(rx4b); ++ r1x4b = av_clip_int16_sse(r1x4b); + g1x4b = _mm_cvtps_epi32(gx4b); ++ g1x4b = av_clip_int16_sse(g1x4b); + b1x4b = _mm_cvtps_epi32(bx4b); ++ b1x4b = av_clip_int16_sse(b1x4b); + + tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -8449,12 +8493,12 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint8_t *rdsty = dsty; @@ -8737,21 +8781,21 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_p010_2_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint16_t *rdsty = dsty; @@ -8954,6 +8998,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); + yoax4 = _mm_srai_epi32(yoax4, out_sh); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_slli_epi32(yoax4, out_sh2); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); @@ -8961,9 +9006,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); + yobx4 = _mm_srai_epi32(yobx4, out_sh); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_slli_epi32(yobx4, out_sh2); + + y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); @@ -8984,6 +9029,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); + y1oax4 = _mm_srai_epi32(y1oax4, out_sh); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_slli_epi32(y1oax4, out_sh2); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); @@ -8991,9 +9037,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); + y1obx4 = _mm_srai_epi32(y1obx4, out_sh); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_slli_epi32(y1obx4, out_sh2); + + y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); @@ -9025,8 +9071,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + + uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); + uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ uvoax4 = _mm_slli_epi32(uvoax4, out_sh2); ++ uvobx4 = _mm_slli_epi32(uvobx4, out_sh2); + uvox8 = _mm_packus_epi32(uvoax4, uvobx4); -+ uvox8 = _mm_slli_epi16(uvox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); + } + } @@ -9038,11 +9085,11 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_p010_2_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} @@ -9104,18 +9151,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++X86_64_V2 void tonemap_frame_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V2 void tonemap_frame_p010_2_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +#endif // AVFILTER_X86_TONEMAPX_INTRIN_SSE_H