assets/shaders/decode/astc.comp

#version 450
/* Copyright (c) 2020-2024 Hans-Kristian Arntzen
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#extension GL_EXT_samplerless_texture_functions : require
// Workgroup X/Y dimensions come from specialization constants 0 and 1; Z is fixed at 4.
// build_coord() uses the Z index to select one texel of a 2x2 group of payload texels,
// and gl_WorkGroupSize.xy as the pixel footprint of one block.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 4) in;

#include "bitextract.h"

// Lookup tables (descriptor set 1).
// NOTE(review): not referenced in this part of the file; purpose inferred from the name only.
layout(set = 1, binding = 0) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer;
// Fetched in decode_endpoint() at (quant.w + decoded integer) to unquantize endpoint values.
layout(set = 1, binding = 1) uniform utextureBuffer LUTEndpointUnquantize;
// Fetched in decode_weights() with BlockMode.weight_mode_index. The result is used as
// (x = plain bits per value, y != 0 -> trit coding, z != 0 -> quint coding, w = base offset into LUTWeightUnquantize).
layout(set = 1, binding = 2) uniform utextureBuffer LUTWeightQuantizer;
// Fetched in decode_weight() at (quant.w + decoded integer) to unquantize weights.
layout(set = 1, binding = 3) uniform utextureBuffer LUTWeightUnquantize;
// Packed trit/quint decode table: trit blocks index [0, 255], quint blocks index 256 + q.
// Each entry holds 3 bits per decoded value (see decode_integer_sequence()).
layout(set = 1, binding = 4) uniform utextureBuffer LUTTritQuintDecode;
// NOTE(review): not referenced in this part of the file; presumably the partition assignment table.
layout(set = 1, binding = 5) uniform utexture2D LUTPartitionTable;

// Decoded output image and the compressed block payload (descriptor set 0).
layout(set = 0, binding = 0) writeonly uniform uimage2D OutputImage;
layout(set = 0, binding = 1) uniform utexture2D PayloadInput;
// Specialization constant 2. When true, LDR endpoints are expanded as (v << 8) | 0x80 in
// interpolate_endpoint() and any non-LDR endpoint mode raises decode_error in decode_endpoint().
layout(constant_id = 2) const bool DECODE_8BIT = false;

// Decode modes reported by decode_endpoint() and consumed by interpolate_endpoint() / decode_fp16().
const int MODE_LDR = 0;
const int MODE_HDR = 1;
// HDR color with LDR alpha.
const int MODE_HDR_LDR_ALPHA = 2;

layout(push_constant) uniform Registers
{
    // Color written by emit_decode_error() when a block fails to decode.
    uvec4 error_color;
    // NOTE(review): not read in this part of the file; presumably the output image size in pixels.
    ivec2 resolution;
} registers;

// Exchanges the values of a and b.
void swap(inout int a, inout int b)
{
    int old_b = b;
    b = a;
    a = old_b;
}

// Returns (pixel coordinate in xy, payload texel coordinate in zw) for this invocation.
// Each workgroup covers a 2x2 group of payload texels; the Z invocation index picks
// which one (bit 0 -> x, bit 1 -> y). XY invocation indices address pixels inside it.
ivec4 build_coord()
{
    int sub_block = int(gl_LocalInvocationID.z);
    ivec2 sub_offset = ivec2(sub_block & 1, (sub_block >> 1) & 1);
    ivec2 payload_coord = ivec2(gl_WorkGroupID.xy) * 2 + sub_offset;

    ivec2 block_origin = payload_coord * ivec2(gl_WorkGroupSize.xy);
    ivec2 pixel = block_origin + ivec2(gl_LocalInvocationID.xy);
    return ivec4(pixel, payload_coord);
}

// Expands both endpoints to 16-bit range according to decode_mode, then blends them
// with a 6-bit weight (0..64) per channel: (ep0 * (64 - w) + ep1 * w + 32) >> 6.
ivec4 interpolate_endpoint(ivec4 ep0, ivec4 ep1, ivec4 weight, int decode_mode)
{
    ivec4 lo = ep0;
    ivec4 hi = ep1;

    if (decode_mode == MODE_HDR)
    {
        // 12-bit HDR endpoints -> 16-bit.
        lo = lo << 4;
        hi = hi << 4;
    }
    else if (decode_mode == MODE_HDR_LDR_ALPHA)
    {
        // HDR color, 8-bit alpha replicated to 16-bit.
        lo = ivec4(lo.rgb << 4, lo.a * 0x101);
        hi = ivec4(hi.rgb << 4, hi.a * 0x101);
    }
    else if (DECODE_8BIT)
    {
        // Not strictly right in every case: regular ASTC with sRGB is supposed to
        // decode alpha as FP16 even when the color channels are 8-bit sRGB.
        // It is right for decode_unorm8 mode, and for sanity unorm8 decoding
        // is assumed to be implied whenever sRGB is used.
        lo = (lo << 8) | ivec4(0x80);
        hi = (hi << 8) | ivec4(0x80);
    }
    else
    {
        // 8-bit LDR endpoints replicated to 16-bit.
        lo = lo * 0x101;
        hi = hi * 0x101;
    }

    ivec4 blended = lo * (64 - weight) + hi * weight + 32;
    return blended >> 6;
}

// Component-wise logical OR of two boolean vectors.
bvec4 bvec_or(bvec4 a, bvec4 b)
{
    return bvec4(a.x || b.x, a.y || b.y, a.z || b.z, a.w || b.w);
}

// Scalar UNORM16 -> FP16 bit pattern conversion with ASTC's rounding rules:
// 0xffff maps to exactly 1.0 (0x3c00); every other value is color / 0x10000
// rounded down to FP16.
uint round_down_quantize_fp16(int color)
{
    if (color == 0xffff)
        return 0x3c00u;

    int top_bit = findMSB(color);
    int exponent = top_bit - 1;

    // Too small for a normalized FP16 value: emit a denormal.
    if (exponent < 1)
        return uint(color << 8);

    // Normalize so the leading one sits at bit 10, then keep the 10 fraction bits below it.
    int mantissa = ((color << 10) >> top_bit) & 0x3ff;
    return uint(mantissa | (exponent << 10));
}

// Vector UNORM16 -> FP16 bit pattern conversion with ASTC's rounding rules:
// 0xffff maps to exactly 1.0 (0x3c00); every other value is color / 0x10000
// rounded down to FP16.
uvec4 round_down_quantize_fp16(ivec4 color)
{
    ivec4 top_bit = findMSB(color);
    ivec4 exponent = top_bit - 1;
    ivec4 mantissa = ((color << 10) >> top_bit) & 0x3ff;

    uvec4 normalized = uvec4(mantissa | (exponent << 10));
    uvec4 denormal = uvec4(color << 8);

    // Pick the denormal encoding where the exponent would be below 1,
    // then override the special 0xffff -> 1.0 case.
    uvec4 result = mix(normalized, denormal, lessThan(exponent, ivec4(1)));
    return mix(result, uvec4(0x3c00), equal(color, ivec4(0xffff)));
}

// Converts an interpolated 16-bit color into FP16 bit patterns.
// MODE_LDR: plain UNORM16 -> FP16 via round_down_quantize_fp16().
// Other modes: the value is treated as a pseudo-logarithmic FP16 encoding
// (5-bit exponent, 11-bit mantissa remapped piecewise as the spec prescribes).
// MODE_HDR_LDR_ALPHA additionally decodes alpha with the LDR rule.
uvec4 decode_fp16(ivec4 color, int decode_mode)
{
    if (decode_mode == MODE_LDR)
        return round_down_quantize_fp16(color);

    ivec4 exponent = color >> 11;
    ivec4 mantissa = color & 0x7ff;

    // Piecewise mantissa remap: m < 512 -> 3m, m >= 1536 -> 5m - 2048, otherwise 4m - 512.
    ivec4 remapped = mix(4 * mantissa - 512, 3 * mantissa, lessThan(mantissa, ivec4(512)));
    remapped = mix(remapped, 5 * mantissa - 2048, greaterThanEqual(mantissa, ivec4(1536)));

    ivec4 bits = (exponent << 10) + (remapped >> 3);

    // +Inf and NaN collapse to the largest finite value, 0x7bff.
    bvec4 is_nan = greaterThan(bits & 0x7fff, ivec4(0x7c00));
    bvec4 is_inf = equal(bits, ivec4(0x7c00));
    bits = mix(bits, ivec4(0x7bff), bvec_or(is_nan, is_inf));

    if (decode_mode == MODE_HDR_LDR_ALPHA)
        bits.a = int(round_down_quantize_fp16(color.a));

    return uvec4(bits);
}

// Block configuration decoded from the first payload word by decode_block_mode().
struct BlockMode
{
    // Width and height of the stored weight grid.
    ivec2 weight_grid_size;
    // Index into LUTWeightQuantizer (see decode_weights()).
    int weight_mode_index;
    // Number of partitions, 1..4.
    int num_partitions;
    // 10-bit partition seed; only written when num_partitions > 1.
    int seed;
    // Raw color endpoint mode bits: 4 bits for one partition, 6 bits otherwise.
    int cem;
    // Total number of configuration bits, including CCS bits for dual-plane blocks.
    int config_bits;
    // Configuration bits stored at the start of the block (the rest sit before the weights).
    int primary_config_bits;
    // True when the block carries a second weight plane.
    bool dual_plane;
    // True for void-extent blocks; the other fields are not filled in that case.
    bool void_extent;
};

// Per-invocation error flag. Decode helpers set it; CHECK_DECODE_ERROR() acts on it.
bool decode_error = false;

// Parses the block mode and partition/CEM configuration from payload.x.
//
// For void-extent blocks only BlockMode.void_extent is written and the function returns early.
// Otherwise it fills in the weight grid size, weight mode index, dual-plane flag,
// partition count, seed, CEM bits and the two config bit counts.
//
// Sets the global decode_error when:
//  - the block mode is a reserved encoding,
//  - the weight grid exceeds gl_WorkGroupSize.xy in either dimension,
//  - dual-plane is combined with 4 partitions.
BlockMode decode_block_mode(uvec4 payload)
{
    BlockMode mode;
    // Low 9 bits equal to 0x1fc mark a void-extent block.
    mode.void_extent = (payload.x & 0x1ffu) == 0x1fcu;
    if (mode.void_extent)
        return mode;

    // Bit 10 is the dual-plane flag (may be overridden below).
    mode.dual_plane = (payload.x & (1u << 10u)) != 0u;

    uint higher = (payload.x >> 2u) & 3u;
    uint lower = payload.x & 3u;

    if (lower != 0)
    {
        // Layout with bits [1:0] != 0.
        // Weight mode index is assembled from bit 4 (-> bit 0), bits [1:0] (-> bits [2:1]) and bit 9 (-> bit 3).
        mode.weight_mode_index = int((payload.x >> 4u) & 1u);
        mode.weight_mode_index |= int((payload.x << 1u) & 6u);
        mode.weight_mode_index |= int((payload.x >> 6u) & 8u);

        // Bits [3:2] select how bits [6:5] and [8:7] map to grid width/height.
        if (higher < 2u)
        {
            mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 2) + 4 + 4 * higher);
            mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2);
        }
        else if (higher == 2u)
        {
            mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2);
            mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 2) + 8);
        }
        else
        {
            // Bit 8 picks between the two remaining layouts; only bit 7 is left as a size field.
            if ((payload.x & (1u << 8u)) != 0u)
            {
                mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 1) + 2);
                mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2);
            }
            else
            {
                mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2);
                mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 1) + 6);
            }
        }
    }
    else
    {
        // Layout with bits [1:0] == 0. p3 is the top bit of the weight mode index.
        int p3 = int(bitfieldExtract(payload.x, 9, 1));
        int hi = int(bitfieldExtract(payload.x, 7, 2));
        int lo = int(bitfieldExtract(payload.x, 5, 2));
        if (hi == 0)
        {
            mode.weight_grid_size.x = 12;
            mode.weight_grid_size.y = lo + 2;
        }
        else if (hi == 1)
        {
            mode.weight_grid_size.x = lo + 2;
            mode.weight_grid_size.y = 12;
        }
        else if (hi == 2)
        {
            // Bits [10:9] are reused as a size field here, so dual-plane and p3 are forced off.
            mode.dual_plane = false;
            p3 = 0;
            mode.weight_grid_size.x = lo + 6;
            mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 9, 2) + 6);
        }
        else
        {
            if (lo == 0)
                mode.weight_grid_size = ivec2(6, 10);
            else if (lo == 1)
                mode.weight_grid_size = ivec2(10, 6);
            else
                decode_error = true; // Remaining encodings are treated as invalid.
        }

        int p0 = int(bitfieldExtract(payload.x, 4, 1));
        int p1 = int(bitfieldExtract(payload.x, 2, 1));
        int p2 = int(bitfieldExtract(payload.x, 3, 1));
        mode.weight_mode_index = p0 + (p1 << 1) + (p2 << 2) + (p3 << 3);
    }

    // 11 bits for block mode.
    // 2 bits for partition select
    // If partitions > 1:
    //   4 bits CEM selector
    //   If dual_plane:
    //     2 bits of CCS
    // else:
    //   10 for partition seed
    //   2 bits for CEM main selector
    //   If CEM[1:0] = 00:
    //     4 bits for CEM extra selector if all same type.
    //   else:
    //     (1 + 2) * num_partitions if different types.
    //     First 4 bits are encoded next to CEM[1:0], otherwise, packed before weights.
    //   If dual_plane:
    //     2 bits of CCS before extra CEM bits.
    // NOTE(review): the "If partitions > 1" / "else" headings above appear swapped relative
    // to the code below, which reads the 4-bit CEM for a single partition and the seed plus
    // 6-bit CEM for multiple partitions.
    const int CONFIG_BITS_BLOCK = 11;
    const int CONFIG_BITS_PARTITION_MODE = 2;
    const int CONFIG_BITS_SEED = 10;
    const int CONFIG_BITS_PRIMARY_MULTI_CEM = 2;
    const int CONFIG_BITS_CEM = 4;
    const int CONFIG_BITS_EXTRA_CEM_PER_PARTITION = 3;
    const int CONFIG_BITS_CCS = 2;

    mode.num_partitions = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK, CONFIG_BITS_PARTITION_MODE)) + 1;

    if (mode.num_partitions > 1)
    {
        // Multi-partition: 10-bit seed followed by a 6-bit CEM field (2 selector bits + 4 more).
        mode.seed = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_SEED));
        mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED,
                                       CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM));
    }
    else
        mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_CEM));

    int config_bits;
    if (mode.num_partitions > 1)
    {
        // CEM[1:0] == 0 means every partition shares one endpoint mode.
        bool single_cem = (mode.cem & 3) == 0;
        if (single_cem)
        {
            config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE +
                          CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM;
        }
        else
        {
            config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE +
                          CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM +
                          CONFIG_BITS_EXTRA_CEM_PER_PARTITION * mode.num_partitions;
        }
    }
    else
    {
        config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_CEM;
    }

    // Other config bits are packed before the weights.
    int primary_config_bits;
    if (mode.num_partitions > 1)
    {
        primary_config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED +
                              CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM;
    }
    else
        primary_config_bits = config_bits;

    // CCS bits count towards the total but not towards the primary (front-of-block) bits.
    if (mode.dual_plane)
        config_bits += CONFIG_BITS_CCS;

    // This is not allowed.
    if (any(greaterThan(mode.weight_grid_size, ivec2(gl_WorkGroupSize.xy))))
        decode_error = true;
    if (mode.dual_plane && mode.num_partitions > 3)
        decode_error = true;

    mode.config_bits = config_bits;
    mode.primary_config_bits = primary_config_bits;
    return mode;
}

// Division by 3 via multiplication with a 16-bit fixed-point reciprocal (0x5556 / 65536),
// avoiding an integer divide.
int idiv3_floor(int v)
{
    const int ONE_THIRD_Q16 = 0x5556;
    int scaled = v * ONE_THIRD_Q16;
    return scaled >> 16;
}

// Rounded-up division by 3: biases the input by 2 and reuses idiv3_floor().
int idiv3_ceil(int v)
{
    int biased = v + 2;
    return idiv3_floor(biased);
}

// Division by 5 via multiplication with a 16-bit fixed-point reciprocal (0x3334 / 65536),
// avoiding an integer divide.
int idiv5_floor(int v)
{
    const int ONE_FIFTH_Q16 = 0x3334;
    int scaled = v * ONE_FIFTH_Q16;
    return scaled >> 16;
}

// Rounded-up division by 5: biases the input by 4 and reuses idiv5_floor().
int idiv5_ceil(int v)
{
    int biased = v + 4;
    return idiv5_floor(biased);
}

// Builds a 128-bit mask (x = least significant word) with the lowest `bits` bits set.
uvec4 build_bitmask(int bits)
{
    // Number of bits that fall inside each 32-bit word, limited to a legal shift amount.
    const ivec4 word_base = ivec4(0, 32, 64, 96);
    ivec4 bits_in_word = clamp(ivec4(bits) - word_base, ivec4(0), ivec4(31));
    uvec4 partial = (uvec4(1) << bits_in_word) - 1u;

    // A word that is entirely covered cannot be expressed with a 31-bit shift; force all ones.
    bvec4 word_full = greaterThanEqual(uvec4(bits), uvec4(32, 64, 96, 128));
    return mix(partial, uvec4(0xffffffffu), word_full);
}

// Decodes element `index` of a bounded integer sequence stored in `payload` starting at `start_bit`.
//
// quant.x: number of plain bits per value.
// quant.y: non-zero if values carry a trit (5 values share one 8-bit trit block).
// quant.z: non-zero if values carry a quint (3 values share one 7-bit quint block).
//
// Returns the quantized value (trit/quint in the high part, plain bits in the low part).
// Unquantization is left to the caller.
int decode_integer_sequence(uvec4 payload, int start_bit, int index, ivec3 quant)
{
    int ret;
    if (quant.y != 0)
    {
        // Trit-decoding.
        // Locate the 5-value block holding `index` and the value's position inside it.
        int block = idiv5_floor(index);
        int offset = index - block * 5;
        start_bit += block * (5 * quant.x + 8);

        // The 8 trit bits are interleaved with the plain bits of the five values.
        int t0_t1_offset = start_bit + (quant.x * 1 + 0);
        int t2_t3_offset = start_bit + (quant.x * 2 + 2);
        int t4_offset    = start_bit + (quant.x * 3 + 4);
        int t5_t6_offset = start_bit + (quant.x * 4 + 5);
        int t7_offset    = start_bit + (quant.x * 5 + 7);

        int t = (extract_bits(payload, t0_t1_offset, 2) << 0) |
                (extract_bits(payload, t2_t3_offset, 2) << 2) |
                (extract_bits(payload, t4_offset, 1) << 4) |
                (extract_bits(payload, t5_t6_offset, 2) << 5) |
                (extract_bits(payload, t7_offset, 1) << 7);

        // The LUT packs the five decoded trits at 3 bits each; pick ours.
        t = int(texelFetch(LUTTritQuintDecode, t).x);
        t = (t >> (3 * offset)) & 7;

        // Bit position of this value's plain bits: preceding plain bits plus preceding trit bits.
        int m_offset = offset * quant.x;
        m_offset += idiv5_ceil(offset * 8);

        if (quant.x != 0)
        {
            int m = extract_bits(payload, m_offset + start_bit, quant.x);
            ret = (t << quant.x) | m;
        }
        else
            ret = t;
    }
    else if (quant.z != 0)
    {
        // Quint-decoding
        // Locate the 3-value block holding `index` and the value's position inside it.
        int block = idiv3_floor(index);
        int offset = index - block * 3;
        start_bit += block * (3 * quant.x + 7);

        // The 7 quint bits are interleaved with the plain bits of the three values.
        int q0_q1_q2_offset = start_bit + (quant.x * 1 + 0);
        int q3_q4_offset    = start_bit + (quant.x * 2 + 3);
        int q5_q6_offset    = start_bit + (quant.x * 3 + 5);

        int q = (extract_bits(payload, q0_q1_q2_offset, 3) << 0) |
                (extract_bits(payload, q3_q4_offset, 2) << 3) |
                (extract_bits(payload, q5_q6_offset, 2) << 5);

        // Quint entries live after the 256 trit entries in the shared LUT.
        q = int(texelFetch(LUTTritQuintDecode, 256 + q).x);
        q = (q >> (3 * offset)) & 7;

        // Bit position of this value's plain bits: preceding plain bits plus preceding quint bits.
        int m_offset = offset * quant.x;
        m_offset += idiv3_ceil(offset * 7);

        if (quant.x != 0)
        {
            int m = extract_bits(payload, m_offset + start_bit, quant.x);
            ret = (q << quant.x) | m;
        }
        else
            ret = q;
    }
    else
    {
        // Plain bits only: values are packed back to back.
        int bit = index * quant.x;
        ret = extract_bits(payload, start_bit + bit, quant.x);
    }
    return ret;
}

// Scales a pixel coordinate by a per-axis factor D derived from the workgroup size:
// D = trunc((1024 + (size >> 1) + 0.5) / (size - 1)), evaluated in floating point.
// NOTE(review): presumably the result feeds decode_weights() as `normalized_pixel`;
// the call site is outside this part of the file.
ivec2 normalize_coord(ivec2 pixel_coord)
{
    // This resolves to a compile-time constant.
    const ivec2 D = ivec2((vec2((1024 + ivec2(gl_WorkGroupSize.xy >> 1))) + 0.5) / vec2(gl_WorkGroupSize.xy - 1));
    ivec2 c = D * pixel_coord;
    return c;
}

// Reads weight number `weight_index` from the (already reversed) payload and unquantizes it.
// quant.xyz describes the integer sequence encoding; quant.w is the base offset
// of this quantization mode inside LUTWeightUnquantize.
int decode_weight(uvec4 payload, int weight_index, ivec4 quant)
{
    int quantized = decode_integer_sequence(payload, 0, weight_index, quant.xyz);
    int lut_index = quantized + quant.w;
    return int(texelFetch(LUTWeightUnquantize, lut_index).x);
}

// Bilinearly interpolates the weight grid at `coord` with 4-bit fractions `fractional`.
// Weights are stored interleaved: element i of the selected plane lives at
// stride * i + offset. Neighbours whose interpolation factor is zero are not decoded.
int decode_weight_bilinear(uvec4 payload, ivec2 coord, int weight_resolution,
                           int stride, int offset, ivec2 fractional, ivec4 quant)
{
    bool lerp_x = fractional.x != 0;
    bool lerp_y = fractional.y != 0;
    int base = coord.y * weight_resolution + coord.x;

    int p00 = decode_weight(payload, stride * base + offset, quant);

    // Right neighbour, only needed with a horizontal fraction.
    int p10 = p00;
    if (lerp_x)
        p10 = decode_weight(payload, stride * (base + 1) + offset, quant);

    // Next row, only needed with a vertical fraction.
    int p01 = p00;
    int p11 = p10;
    if (lerp_y)
    {
        int below = base + weight_resolution;
        p01 = decode_weight(payload, stride * below + offset, quant);
        if (lerp_x)
            p11 = decode_weight(payload, stride * (below + 1) + offset, quant);
        else
            p11 = p01;
    }

    // Integer bilinear factors that sum to 16.
    int w11 = (fractional.x * fractional.y + 8) >> 4;
    int w10 = fractional.x - w11;
    int w01 = fractional.y - w11;
    int w00 = 16 - fractional.x - fractional.y + w11;

    int sum = p00 * w00 + p10 * w10 + p01 * w01 + p11 * w11;
    return (sum + 8) >> 4;
}

// Decodes the per-channel interpolation weights for one pixel.
//
// payload:          raw 128-bit block.
// mode:             block configuration from decode_block_mode().
// normalized_pixel: pixel position scaled as by normalize_coord().
// weight_cost_bits: (out) number of bits occupied by the weight data; written even on error.
//
// Returns one weight per RGBA channel. For dual-plane blocks the channel selected by
// the CCS field uses the second plane; all other channels use the first.
// Sets decode_error and returns 0 if the weight bit count or weight count is out of range.
ivec4 decode_weights(uvec4 payload, BlockMode mode, ivec2 normalized_pixel, out int weight_cost_bits)
{
    // (plain bits, trit flag, quint flag, unquantize LUT base) for this weight mode.
    ivec4 quant = ivec4(texelFetch(LUTWeightQuantizer, mode.weight_mode_index));
    int num_weights = mode.weight_grid_size.x * mode.weight_grid_size.y;
    // Dual-plane stores two weights per grid point.
    num_weights <<= int(mode.dual_plane);
    weight_cost_bits =
        quant.x * num_weights +
        idiv5_ceil(num_weights * 8 * quant.y) +
        idiv3_ceil(num_weights * 7 * quant.z);

    // Decoders must deal with error conditions and return the correct error color.
    if (weight_cost_bits < 24 || weight_cost_bits > 96 || num_weights > 64)
    {
        decode_error = true;
        return ivec4(0);
    }

    // The color component selector must be read from the original bit order,
    // before the payload is reversed below. It sits below the weights and any extra CEM bits.
    int ccs;
    if (mode.dual_plane)
    {
        int extra_cem_bits = 0;
        if ((mode.cem & 3) != 0)
            extra_cem_bits = max(mode.num_partitions * 3 - 4, 0);
        ccs = extract_bits(payload, 126 - weight_cost_bits - extra_cem_bits, 2);
    }

    // Reverse all 128 bits so the weight data starts at bit 0, then keep only the weight bits.
    payload = bitfieldReverse(payload);
    payload = payload.wzyx;
    payload &= build_bitmask(weight_cost_bits);

    // Scale the normalized coordinate to weight grid.
    ivec2 weight_pixel_fixed_point = (normalized_pixel * (mode.weight_grid_size - 1) + 32) >> 6;
    // Split into integer grid position and 4-bit fraction.
    ivec2 weight_pixel = weight_pixel_fixed_point >> 4;
    ivec2 weight_pixel_fractional = weight_pixel_fixed_point & 0xf;

    ivec4 ret;
    // Stride is 2 for dual-plane (planes interleaved), 1 otherwise.
    int primary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x,
                                                1 << int(mode.dual_plane), 0,
                                                weight_pixel_fractional, quant);
    if (mode.dual_plane)
    {
        int secondary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x,
                                                      2, 1,
                                                      weight_pixel_fractional, quant);
        // Only the channel whose index equals ccs takes the secondary weight.
        ret = mix(ivec4(primary_weight), ivec4(secondary_weight), equal(ivec4(ccs), ivec4(0, 1, 2, 3)));
    }
    else
        ret = ivec4(primary_weight);

    return ret;
}

// Endpoint mode 0: LDR luminance, direct. Luminance is replicated to RGB, alpha is 0xff.
void decode_endpoint_ldr_luma_direct(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1)
{
    const int OPAQUE = 0xff;
    ep0 = ivec4(v0, v0, v0, OPAQUE);
    ep1 = ivec4(v1, v1, v1, OPAQUE);
}

// Endpoint mode 2: HDR luminance, large range. Produces 12-bit luminance endpoints
// replicated to RGB, with alpha fixed at 0x780.
void decode_endpoint_hdr_luma_direct(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1)
{
    // When the values arrive out of order they are swapped and nudged by half a step.
    bool in_order = v1 >= v0;
    int y0 = in_order ? (v0 << 4) : ((v1 << 4) + 8);
    int y1 = in_order ? (v1 << 4) : ((v0 << 4) - 8);

    ep0 = ivec4(y0, y0, y0, 0x780);
    ep1 = ivec4(y1, y1, y1, 0x780);
}

// Endpoint mode 3: HDR luminance, small range. A 12-bit base plus a small positive delta,
// replicated to RGB, with alpha fixed at 0x780.
void decode_endpoint_hdr_luma_direct_small_range(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1)
{
    // Bit 7 of v0 selects between the two bit layouts of the base/delta split.
    bool alt_layout = (v0 & 0x80) != 0;
    int base_mask = alt_layout ? 0xe0 : 0xf0;
    int delta_mask = alt_layout ? 0x1f : 0x0f;
    int shift = alt_layout ? 2 : 1;

    int y0 = ((v1 & base_mask) << 4) | ((v0 & 0x7f) << shift);
    int d = (v1 & delta_mask) << shift;
    int y1 = min(y0 + d, 0xfff);

    ep0 = ivec4(y0, y0, y0, 0x780);
    ep1 = ivec4(y1, y1, y1, 0x780);
}

// Endpoint mode 1: LDR luminance, base + offset. The second endpoint is clamped to 0xff.
void decode_endpoint_ldr_luma_base_offset(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1)
{
    int base = (v1 & 0xc0) | (v0 >> 2);
    int top = min(base + (v1 & 0x3f), 0xff);

    ep0 = ivec4(base, base, base, 0xff);
    ep1 = ivec4(top, top, top, 0xff);
}

// Endpoint mode 4: LDR luminance + alpha, direct. (v0, v1) are luminance, (v2, v3) alpha.
void decode_endpoint_ldr_luma_alpha_direct(out ivec4 ep0, out ivec4 ep1,
    int v0, int v1, int v2, int v3)
{
    ep0 = ivec4(v0, v0, v0, v2);
    ep1 = ivec4(v1, v1, v1, v3);
}

// Blue-contraction: red and green are averaged with blue; blue and alpha pass through.
ivec4 blue_contract(int r, int g, int b, int a)
{
    int contracted_r = (r + b) >> 1;
    int contracted_g = (g + b) >> 1;
    return ivec4(contracted_r, contracted_g, b, a);
}

// Moves the top bit of `a` into the top bit of `b` (after shifting `b` right by one),
// then turns the remaining bits of `a` into a signed 6-bit value.
void bit_transfer_signed(inout int a, inout int b)
{
    b = (b >> 1) | (a & 0x80);

    int low6 = (a >> 1) & 0x3f;
    // Signed bitfieldExtract sign-extends from bit 5.
    a = bitfieldExtract(low6, 0, 6);
}

// Endpoint mode 5: LDR luminance + alpha, base + signed offset, clamped to [0, 0xff].
void decode_endpoint_ldr_luma_alpha_base_offset(out ivec4 ep0, out ivec4 ep1,
    int v0, int v1, int v2, int v3)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);

    // x = luminance, y = alpha.
    ivec2 base = ivec2(v0, v2);
    ivec2 delta = ivec2(v1, v3);
    ivec2 first = clamp(base, ivec2(0), ivec2(0xff));
    ivec2 second = clamp(base + delta, ivec2(0), ivec2(0xff));

    ep0 = ivec4(first.xxx, first.y);
    ep1 = ivec4(second.xxx, second.y);
}

// Endpoint mode 6: LDR RGB, base + scale. ep1 is the base color, ep0 is the base
// scaled by v3 / 256. Alpha is 0xff.
void decode_endpoint_ldr_rgb_base_scale(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3)
{
    ivec3 base = ivec3(v0, v1, v2);
    ep1 = ivec4(base, 0xff);
    ep0 = ivec4((base * v3) >> 8, 0xff);
}

// Endpoint mode 10: LDR RGB base + scale with two explicit alpha values (v4, v5).
void decode_endpoint_ldr_rgb_base_scale_two_a(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3, int v4, int v5)
{
    ivec3 base = ivec3(v0, v1, v2);
    ep1 = ivec4(base, v5);
    ep0 = ivec4((base * v3) >> 8, v4);
}

// Endpoint mode 8: LDR RGB, direct. If the second color sums lower than the first,
// the endpoints are swapped and blue-contracted. Alpha is 0xff.
void decode_endpoint_ldr_rgb_direct(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3, int v4, int v5)
{
    ivec3 first = ivec3(v0, v2, v4);
    ivec3 second = ivec3(v1, v3, v5);
    int first_sum = first.r + first.g + first.b;
    int second_sum = second.r + second.g + second.b;

    if (second_sum < first_sum)
    {
        ep0 = blue_contract(second.r, second.g, second.b, 0xff);
        ep1 = blue_contract(first.r, first.g, first.b, 0xff);
    }
    else
    {
        ep0 = ivec4(first, 0xff);
        ep1 = ivec4(second, 0xff);
    }
}

// Endpoint mode 7: HDR RGB, base + scale.
// Unpacks a 12-bit base color (ep1) and a scale that is subtracted from every channel
// to form ep0. Both endpoints are clamped to [0, 0xfff]; alpha is fixed at 0x780.
void decode_endpoint_hdr_rgb_scale(out ivec4 ep0, out ivec4 ep1,
    int v0, int v1, int v2, int v3)
{
    // Mind-numbing weird format, just copy from spec ...
    // The top bits of v0..v2 form a 4-bit value selecting the sub-mode and major component.
    int mode_value = ((v0 & 0xc0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4);
    int major_component;
    int mode;

    if ((mode_value & 0xc) != 0xc)
    {
        major_component = mode_value >> 2;
        mode = mode_value & 3;
    }
    else if (mode_value != 0xf)
    {
        major_component = mode_value & 3;
        mode = 4;
    }
    else
    {
        major_component = 0;
        mode = 5;
    }

    // Low bits present in every sub-mode.
    int red = v0 & 0x3f;
    int green = v1 & 0x1f;
    int blue = v2 & 0x1f;
    int scale = v3 & 0x1f;

    // Spare bits whose destination depends on the sub-mode.
    int x0 = (v1 >> 6) & 1;
    int x1 = (v1 >> 5) & 1;
    int x2 = (v2 >> 6) & 1;
    int x3 = (v2 >> 5) & 1;
    int x4 = (v3 >> 7) & 1;
    int x5 = (v3 >> 6) & 1;
    int x6 = (v3 >> 5) & 1;

    // One-hot mode; each mask below lists the sub-modes in which that bit assignment applies.
    int ohm = 1 << mode;
    if ((ohm & 0x30) != 0) green |= x0 << 6;
    if ((ohm & 0x3a) != 0) green |= x1 << 5;
    if ((ohm & 0x30) != 0) blue |= x2 << 6;
    if ((ohm & 0x3a) != 0) blue |= x3 << 5;
    if ((ohm & 0x3d) != 0) scale |= x6 << 5;
    if ((ohm & 0x2d) != 0) scale |= x5 << 6;
    if ((ohm & 0x04) != 0) scale |= x4 << 7;
    if ((ohm & 0x3b) != 0) red |= x4 << 6;
    if ((ohm & 0x04) != 0) red |= x3 << 6;
    if ((ohm & 0x10) != 0) red |= x5 << 7;
    if ((ohm & 0x0f) != 0) red |= x2 << 7;
    if ((ohm & 0x05) != 0) red |= x1 << 8;
    if ((ohm & 0x0a) != 0) red |= x0 << 8;
    if ((ohm & 0x05) != 0) red |= x0 << 9;
    if ((ohm & 0x02) != 0) red |= x6 << 9;
    if ((ohm & 0x01) != 0) red |= x3 << 10;
    if ((ohm & 0x02) != 0) red |= x5 << 10;

    // Shift amounts per sub-mode are 1, 1, 2, 3, 4, 5.
    int shamt = max(mode, 1);
    red <<= shamt;
    green <<= shamt;
    blue <<= shamt;
    scale <<= shamt;

    // Except in sub-mode 5, green and blue are stored as differences from red.
    if (mode != 5)
    {
        green = red - green;
        blue = red - blue;
    }

    // The major component was stored in the red slot; move it back.
    if (major_component == 1)
        swap(red, green);
    else if (major_component == 2)
        swap(red, blue);

    ep1 = ivec4(clamp(ivec3(red, green, blue), ivec3(0), ivec3(0xfff)), 0x780);
    ep0 = ivec4(clamp(ivec3(red, green, blue) - scale, ivec3(0), ivec3(0xfff)), 0x780);
}

// Endpoint mode 11 (also the RGB part of modes 14 and 15): HDR RGB, direct.
// Produces two 12-bit RGB endpoints clamped to [0, 0xfff]; alpha is fixed at 0x780
// (callers for modes 14/15 overwrite alpha afterwards).
void decode_endpoint_hdr_rgb_direct(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3, int v4, int v5)
{
    // Top bits of v4/v5 select the major component; 3 means a plain direct encoding.
    int major_component = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6);

    if (major_component == 3)
    {
        ep0 = ivec4(v0 << 4, v2 << 4, (v4 & 0x7f) << 5, 0x780);
        ep1 = ivec4(v1 << 4, v3 << 4, (v5 & 0x7f) << 5, 0x780);
        return;
    }

    // Top bits of v1..v3 select one of eight sub-modes.
    int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5);
    int va = v0 | ((v1 & 0x40) << 2);
    int vb0 = v2 & 0x3f;
    int vb1 =  v3 & 0x3f;
    int vc = v1 & 0x3f;
    int vd0 = v4 & 0x7f;
    int vd1 = v5 & 0x7f;

    // Width of the signed d values per sub-mode: 7, 6, 7, 6, 5, 6, 5, 6.
    int d_bits = 7 - (mode & 1);
    if ((mode & 5) == 4)
        d_bits -= 2;

    // Signed bitfieldExtract sign-extends vd0/vd1 from d_bits bits.
    vd0 = bitfieldExtract(vd0, 0, d_bits);
    vd1 = bitfieldExtract(vd1, 0, d_bits);

    // Spare bits whose destination depends on the sub-mode.
    int x0 = (v2 >> 6) & 1;
    int x1 = (v3 >> 6) & 1;
    int x2 = (v4 >> 6) & 1;
    int x3 = (v5 >> 6) & 1;
    int x4 = (v4 >> 5) & 1;
    int x5 = (v5 >> 5) & 1;

    // One-hot mode; each mask below lists the sub-modes in which that bit assignment applies.
    int ohm = 1 << mode;
    if ((ohm & 0xa4) != 0) va |= x0 << 9;
    if ((ohm & 0x08) != 0) va |= x2 << 9;
    if ((ohm & 0x50) != 0) va |= x4 << 9;
    if ((ohm & 0x50) != 0) va |= x5 << 10;
    if ((ohm & 0xa0) != 0) va |= x1 << 10;
    if ((ohm & 0xc0) != 0) va |= x2 << 11;

    if ((ohm & 0x04) != 0) vc |= x1 << 6;
    if ((ohm & 0xe8) != 0) vc |= x3 << 6;
    if ((ohm & 0x20) != 0) vc |= x2 << 7;

    if ((ohm & 0x5b) != 0) vb0 |= x0 << 6;
    if ((ohm & 0x5b) != 0) vb1 |= x1 << 6;
    if ((ohm & 0x12) != 0) vb0 |= x2 << 7;
    if ((ohm & 0x12) != 0) vb1 |= x3 << 7;

    // Scale everything up to 12 bits; the shift shrinks as the sub-mode number grows.
    int shamt = (mode >> 1) ^ 3;
    va <<= shamt;
    vb0 <<= shamt;
    vb1 <<= shamt;
    vc <<= shamt;
    vd0 <<= shamt;
    vd1 <<= shamt;

    ep1 = ivec4(clamp(ivec3(va, va - vb0, va - vb1), ivec3(0), ivec3(0xfff)), 0x780);
    ep0 = ivec4(clamp(ivec3(va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1), ivec3(0), ivec3(0xfff)), 0x780);

    // The major component was stored in the red slot; move it back.
    if (major_component == 1)
    {
        swap(ep0.r, ep0.g);
        swap(ep1.r, ep1.g);
    }
    else if (major_component == 2)
    {
        swap(ep0.r, ep0.b);
        swap(ep1.r, ep1.b);
    }
}

// Endpoint mode 9: LDR RGB, base + signed offset. A negative offset sum swaps the
// endpoints and applies blue-contraction. RGB is clamped to [0, 0xff]; alpha is 0xff.
void decode_endpoint_ldr_rgb_base_offset(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3, int v4, int v5)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);
    bit_transfer_signed(v5, v4);

    ivec3 base = ivec3(v0, v2, v4);
    ivec3 delta = ivec3(v1, v3, v5);
    ivec3 target = base + delta;

    if (delta.x + delta.y + delta.z < 0)
    {
        ep0 = blue_contract(target.r, target.g, target.b, 0xff);
        ep1 = blue_contract(base.r, base.g, base.b, 0xff);
    }
    else
    {
        ep0 = ivec4(base, 0xff);
        ep1 = ivec4(target, 0xff);
    }

    ep0.rgb = clamp(ep0.rgb, ivec3(0), ivec3(0xff));
    ep1.rgb = clamp(ep1.rgb, ivec3(0), ivec3(0xff));
}

// Endpoint mode 12: LDR RGBA, direct. If the second color's RGB sums lower than the
// first, the endpoints are swapped and blue-contracted.
void decode_endpoint_ldr_rgba_direct(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3,
        int v4, int v5, int v6, int v7)
{
    ivec4 first = ivec4(v0, v2, v4, v6);
    ivec4 second = ivec4(v1, v3, v5, v7);
    int first_sum = first.r + first.g + first.b;
    int second_sum = second.r + second.g + second.b;

    if (second_sum < first_sum)
    {
        ep0 = blue_contract(second.r, second.g, second.b, second.a);
        ep1 = blue_contract(first.r, first.g, first.b, first.a);
    }
    else
    {
        ep0 = first;
        ep1 = second;
    }
}

// Endpoint mode 13: LDR RGBA, base + signed offset. A negative RGB offset sum swaps the
// endpoints and applies blue-contraction. All channels are clamped to [0, 0xff].
void decode_endpoint_ldr_rgba_base_offset(out ivec4 ep0, out ivec4 ep1,
        int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7)
{
    bit_transfer_signed(v1, v0);
    bit_transfer_signed(v3, v2);
    bit_transfer_signed(v5, v4);
    bit_transfer_signed(v7, v6);

    ivec4 base = ivec4(v0, v2, v4, v6);
    ivec4 delta = ivec4(v1, v3, v5, v7);
    ivec4 target = base + delta;

    // Only the RGB offsets take part in the ordering decision.
    if (delta.x + delta.y + delta.z < 0)
    {
        ep0 = blue_contract(target.r, target.g, target.b, target.a);
        ep1 = blue_contract(base.r, base.g, base.b, base.a);
    }
    else
    {
        ep0 = base;
        ep1 = target;
    }

    ep0 = clamp(ep0, ivec4(0), ivec4(0xff));
    ep1 = clamp(ep1, ivec4(0), ivec4(0xff));
}

// HDR alpha endpoints for endpoint mode 15. The top bits of v6/v7 select a sub-mode:
// 3 is a direct 7-bit encoding, 0..2 encode a base plus a signed offset.
// Results are 12-bit values; the second endpoint is clamped to [0, 0xfff].
void decode_endpoint_hdr_alpha(out int ep0, out int ep1, int v6, int v7)
{
    int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
    int base = v6 & 0x7f;
    int delta = v7 & 0x7f;

    if (mode == 3)
    {
        ep0 = base << 5;
        ep1 = delta << 5;
        return;
    }

    int sign_bit = 0x20 >> mode;
    int shamt = 4 - mode;

    // The upper bits of the second value extend the base.
    base |= (delta << (mode + 1)) & 0x780;

    // The remaining low bits form the offset; sign-extend it via xor/subtract.
    delta &= 0x3f >> mode;
    delta = (delta ^ sign_bit) - sign_bit;

    base <<= shamt;
    delta <<= shamt;

    ep0 = base;
    ep1 = clamp(delta + base, 0, 0xfff);
}

// Decodes the pair of color endpoints for one partition.
//
// ep0, ep1:            (out) decoded endpoints, before interpolation.
// decode_mode:         (out) MODE_LDR, MODE_HDR or MODE_HDR_LDR_ALPHA depending on ep_mode.
// payload:             raw 128-bit block; masked here to bit_offset + num_endpoint_bits bits.
// bit_offset:          bit position where the endpoint integer sequence starts.
// quant:               xyz describes the integer sequence encoding, w is the base offset
//                      of this quantization mode inside LUTEndpointUnquantize.
// ep_mode:             color endpoint mode, 0..15. Mode class (ep_mode >> 2) determines
//                      how many values are read: 2, 4, 6 or 8.
// base_endpoint_index: index of this partition's first value in the sequence.
// num_endpoint_bits:   number of bits available for endpoint data.
//
// Sets decode_error if DECODE_8BIT is enabled and the endpoint mode is not LDR.
void decode_endpoint(out ivec4 ep0, out ivec4 ep1, out int decode_mode,
                     uvec4 payload, int bit_offset, ivec4 quant, int ep_mode,
                     int base_endpoint_index, int num_endpoint_bits)
{
    // Hide everything past the endpoint data so stray bits cannot leak into decoded values.
    num_endpoint_bits += bit_offset;
    payload &= build_bitmask(num_endpoint_bits);

    // Could of course use an array, but that doesn't lower nicely to indexed registers on all GPUs.
    int v0, v1, v2, v3, v4, v5, v6, v7;

#define DECODE_EP(i) \
    int(texelFetch(LUTEndpointUnquantize, quant.w + decode_integer_sequence(payload, bit_offset, i + base_endpoint_index, quant.xyz)).x)

    // Only decode as many values as the mode class needs; the rest stay unread.
    int hi_bits = ep_mode >> 2;
    v0 = DECODE_EP(0);
    v1 = DECODE_EP(1);

    if (hi_bits >= 1)
    {
        v2 = DECODE_EP(2);
        v3 = DECODE_EP(3);
    }

    if (hi_bits >= 2)
    {
        v4 = DECODE_EP(4);
        v5 = DECODE_EP(5);
    }

    if (hi_bits >= 3)
    {
        v6 = DECODE_EP(6);
        v7 = DECODE_EP(7);
    }

    switch (ep_mode)
    {
    case 0:
        decode_endpoint_ldr_luma_direct(ep0, ep1,
            v0, v1);
        decode_mode = MODE_LDR;
        break;

    case 1:
        decode_endpoint_ldr_luma_base_offset(ep0, ep1,
            v0, v1);
        decode_mode = MODE_LDR;
        break;

    case 2:
        decode_endpoint_hdr_luma_direct(ep0, ep1,
            v0, v1);
        decode_mode = MODE_HDR;
        break;

    case 3:
        decode_endpoint_hdr_luma_direct_small_range(ep0, ep1,
            v0, v1);
        decode_mode = MODE_HDR;
        break;

    case 4:
        decode_endpoint_ldr_luma_alpha_direct(ep0, ep1,
            v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;

    case 5:
        decode_endpoint_ldr_luma_alpha_base_offset(ep0, ep1,
            v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;

    case 6:
        decode_endpoint_ldr_rgb_base_scale(ep0, ep1,
            v0, v1, v2, v3);
        decode_mode = MODE_LDR;
        break;

    case 7:
        decode_endpoint_hdr_rgb_scale(ep0, ep1,
            v0, v1, v2, v3);
        decode_mode = MODE_HDR;
        break;

    case 8:
        decode_endpoint_ldr_rgb_direct(ep0, ep1,
            v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;

    case 9:
        decode_endpoint_ldr_rgb_base_offset(ep0, ep1,
            v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;

    case 10:
        decode_endpoint_ldr_rgb_base_scale_two_a(ep0, ep1,
            v0, v1, v2, v3, v4, v5);
        decode_mode = MODE_LDR;
        break;

    // The three HDR RGB modes share the color decode and differ only in alpha handling.
    case 11:
    case 14:
    case 15:
        decode_endpoint_hdr_rgb_direct(ep0, ep1,
            v0, v1, v2, v3, v4, v5);
        if (ep_mode == 14)
        {
            // HDR color with plain LDR alpha.
            ep0.a = v6;
            ep1.a = v7;
            decode_mode = MODE_HDR_LDR_ALPHA;
        }
        else if (ep_mode == 15)
        {
            // HDR color with HDR alpha.
            decode_endpoint_hdr_alpha(ep0.a, ep1.a, v6, v7);
            decode_mode = MODE_HDR;
        }
        else
            decode_mode = MODE_HDR;
        break;

    case 12:
        decode_endpoint_ldr_rgba_direct(ep0, ep1,
            v0, v1, v2, v3, v4, v5, v6, v7);
        decode_mode = MODE_LDR;
        break;

    case 13:
        decode_endpoint_ldr_rgba_base_offset(ep0, ep1,
            v0, v1, v2, v3, v4, v5, v6, v7);
        decode_mode = MODE_LDR;
        break;
    }

    // HDR content cannot be represented when decoding to 8-bit output.
    if (DECODE_8BIT && decode_mode != MODE_LDR)
        decode_error = true;
}

// If decode_error has been set, writes the error color to the current pixel and returns
// from the enclosing function. Expects a variable named `coord` (with the pixel position
// in .xy) to be in scope at the expansion site, and the enclosing function to return void.
#define CHECK_DECODE_ERROR() do { \
    if (decode_error) \
    { \
        emit_decode_error(coord.xy); \
        return; \
    } \
} while(false)

// Writes the push-constant error color to the output image at `coord`.
void emit_decode_error(ivec2 coord)
{
    uvec4 color = registers.error_color;
    imageStore(OutputImage, coord, color);
}

// Returns the total number of endpoint value pairs stored in the block, given the
// partition count and the raw CEM bits from BlockMode.
int compute_num_endpoint_pairs(int num_partitions, int cem)
{
    // Single partition: cem is the 4-bit endpoint mode; its class (top two bits) + 1 pairs.
    if (num_partitions <= 1)
        return (cem >> 2) + 1;

    int selector = cem & 3;

    // All partitions share one mode, stored in cem[5:2]; its class is cem[5:4].
    if (selector == 0)
        return ((cem >> 4) + 1) * num_partitions;

    // Mixed modes: every partition has `selector` pairs, plus one more for each
    // partition whose class bit (cem[2 + i]) is set.
    int class_bits_set = bitCount(bitfieldExtract(uint(cem), 2, num_partitions));
    return selector * num_partitions + class_bits_set;
}

// Resolves the color endpoint mode and the first endpoint value index for one partition.
//
// payload:             raw 128-bit block.
// weight_cost_bits:    number of bits taken by weights at the top of the block; the extra
//                      CEM bits are read immediately below them.
// cem:                 in: the block's raw CEM field. out: for multi-partition blocks, the
//                      mode resolved for `partition_index`; left untouched for
//                      single-partition blocks.
// base_endpoint_index: out: twice the number of endpoint pairs that belong to the
//                      partitions preceding `partition_index` (0 for single-partition blocks).
// num_partitions:      partition count of the block.
// partition_index:     partition of the texel being decoded.
void decode_cem_base_endpoint(uvec4 payload, int weight_cost_bits, inout int cem, out int base_endpoint_index,
    int num_partitions, int partition_index)
{
    if (num_partitions <= 1)
    {
        base_endpoint_index = 0;
        return;
    }

    int selector = cem & 3;
    int pairs_before;

    if (selector == 0)
    {
        // All partitions share one mode; drop the two selector bits.
        cem >>= 2;
        pairs_before = ((cem >> 2) + 1) * partition_index;
    }
    else
    {
        // Count the pairs of the preceding partitions from the raw field.
        // The zero-width bitfieldExtract is skipped for partition 0.
        pairs_before = 0;
        if (partition_index != 0)
            pairs_before = selector * partition_index + bitCount(bitfieldExtract(uint(cem), 2, partition_index));

        // The remaining CEM bits are stored just below the weight data.
        int extra_cem_bits = 3 * num_partitions - 4;
        int extra_bits = extract_bits(payload, 128 - weight_cost_bits - extra_cem_bits, extra_cem_bits);
        int packed_cem = (extra_bits << 4) | (cem >> 2);

        // One class-offset bit per partition, followed by two mode bits per partition.
        int class_offset_bit = (packed_cem >> partition_index) & 1;
        int ep_bits = (packed_cem >> (num_partitions + 2 * partition_index)) & 3;

        cem = 4 * ((selector - 1) + class_offset_bit) + ep_bits;
    }

    // Two endpoint values per pair.
    base_endpoint_index = 2 * pairs_before;
}

// Decodes the constant color of a void-extent block.
//
// payload:     raw 128-bit block.
// decode_mode: out: MODE_HDR when bit 9 of the payload is set, MODE_LDR otherwise.
//              Set to MODE_LDR on the reserved-bit and extent error paths, and left at
//              MODE_HDR when an HDR block is rejected in 8-bit mode.
//
// Returns the four 16-bit components read from bits 64..127 as (r, g, b, a).
// On a malformed block the global decode_error flag is set and ivec4(0) is returned;
// callers must check decode_error before using the result.
ivec4 void_extent_color(uvec4 payload, out int decode_mode)
{
    // Give the out parameter a defined value on the early error returns.
    decode_mode = MODE_LDR;

    // Bits 10..11 must both be set.
    int reserved = extract_bits(payload, 10, 2);
    if (reserved != 3)
    {
        decode_error = true;
        return ivec4(0);
    }

    // Four 13-bit extent coordinates packed back to back starting at bit 12:
    // (min_s, max_s, min_t, max_t).
    ivec4 extents = ivec4(
        extract_bits(payload, 12, 13),
        extract_bits(payload, 12 + 13, 13),
        extract_bits(payload, 12 + 2 * 13, 13),
        extract_bits(payload, 12 + 3 * 13, 13));

    // All-ones extents are accepted as-is. Otherwise each minimum must be
    // strictly below its maximum.
    bool extents_all_ones = all(equal(extents, ivec4((1 << 13) - 1)));
    if (!extents_all_ones && any(greaterThanEqual(extents.xz, extents.yw)))
    {
        decode_error = true;
        return ivec4(0);
    }

    decode_mode = (payload.x & (1u << 9)) != 0u ? MODE_HDR : MODE_LDR;

    // An HDR constant color cannot be written out as 8-bit data. Flag it as a decode
    // error, the same rule this file applies to non-LDR endpoint modes in 8-bit decode.
    if (DECODE_8BIT && decode_mode != MODE_LDR)
    {
        decode_error = true;
        return ivec4(0);
    }

    int r = extract_bits(payload, 64, 16);
    int g = extract_bits(payload, 64 + 16, 16);
    int b = extract_bits(payload, 64 + 32, 16);
    int a = extract_bits(payload, 64 + 48, 16);

    return ivec4(r, g, b, a);
}

// Compute entry point: each invocation decodes one texel of one ASTC block and
// stores the result (or the error color) to OutputImage.
void main()
{
    // coord.xy is the destination texel, coord.zw the payload texel to fetch.
    ivec4 coord = build_coord();
    if (any(greaterThanEqual(coord.xy, registers.resolution)))
        return;

    ivec2 pixel_coord = ivec2(gl_LocalInvocationID.xy);
    uvec4 payload = texelFetch(PayloadInput, coord.zw, 0);

    BlockMode block_mode = decode_block_mode(payload);
    CHECK_DECODE_ERROR();

    ivec4 final_color;
    int decode_mode;
    if (block_mode.void_extent)
    {
        final_color = void_extent_color(payload, decode_mode);
        CHECK_DECODE_ERROR();
    }
    else
    {
        int weight_cost_bits;
        ivec4 weights = decode_weights(payload, block_mode, normalize_coord(pixel_coord), weight_cost_bits);

        // Look up which partition this texel belongs to. The LUT is tiled by
        // workgroup size, with the tile selected by the partition seed.
        int partition_index = 0;
        if (block_mode.num_partitions > 1)
        {
            int lut_x = pixel_coord.x + int(gl_WorkGroupSize.x) * (block_mode.seed & 31);
            int lut_y = pixel_coord.y + int(gl_WorkGroupSize.y) * (block_mode.seed >> 5);
            partition_index = int(texelFetch(LUTPartitionTable, ivec2(lut_x, lut_y), 0).x);
            // Each LUT entry packs a 2-bit index per partition count; select the
            // field for this block's partition count.
            partition_index = (partition_index >> (2 * block_mode.num_partitions - 4)) & 3;
        }

        int available_endpoint_bits = max(128 - block_mode.config_bits - weight_cost_bits, 0);

        // In multi-partition mode the CEM field is 6 bits wide. Its first two bits
        // tell whether all partitions share the same CEM. If they do not, those bits
        // specify a base class, and the N bits that follow (one per partition) each
        // offset that partition's class by 1.
        int num_endpoint_pairs = compute_num_endpoint_pairs(block_mode.num_partitions, block_mode.cem);

        // Error color must be emitted if we need more than 18 integer sequence encoded values of color.
        if (num_endpoint_pairs > 9)
        {
            decode_error = true;
            CHECK_DECODE_ERROR();
        }

        ivec4 endpoint_quant = ivec4(texelFetch(LUTRemainingBitsToEndpointQuantizer,
                128 * (num_endpoint_pairs - 1) + available_endpoint_bits));

        // Only read the bits we need for endpoints.
        int num_endpoint_values = num_endpoint_pairs * 2;
        available_endpoint_bits =
            endpoint_quant.x * num_endpoint_values +
            idiv5_ceil(endpoint_quant.y * 8 * num_endpoint_values) +
            idiv3_ceil(endpoint_quant.z * 7 * num_endpoint_values);

        // No space left for color endpoints.
        if (all(equal(endpoint_quant.xyz, ivec3(0))))
        {
            decode_error = true;
            CHECK_DECODE_ERROR();
        }

        int endpoint_bit_offset = block_mode.primary_config_bits;
        ivec4 ep0, ep1;

        // Decode CEM for multi-partition schemes.
        int cem = block_mode.cem;
        int base_endpoint_index;
        decode_cem_base_endpoint(payload, weight_cost_bits, cem, base_endpoint_index,
                                 block_mode.num_partitions, partition_index);

        decode_endpoint(ep0, ep1, decode_mode, payload, endpoint_bit_offset, endpoint_quant,
                        cem, base_endpoint_index, available_endpoint_bits);
        CHECK_DECODE_ERROR();

        final_color = interpolate_endpoint(ep0, ep1, weights, decode_mode);
    }

    if (DECODE_8BIT)
    {
        // Keep the top 8 bits of each 16-bit component.
        imageStore(OutputImage, coord.xy, uvec4(final_color >> 8));
    }
    else
    {
        uvec4 encoded;
        // An HDR void-extent color is stored as-is, without going through decode_fp16().
        if (block_mode.void_extent && decode_mode == MODE_HDR)
            encoded = uvec4(final_color);
        else
            encoded = decode_fp16(final_color, decode_mode);
        imageStore(OutputImage, coord.xy, encoded);
    }
}