diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index eb3e4598407374..f1dcbe488015e6 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! */ +/* auto-generated on 2024-12-17 14:54:59 -0500. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" // We include base64_tables once. @@ -10019,6 +10019,9 @@ base64_tail_decode(char *dst, const char_type *src, size_t length, const char_type *srcend = src + length; const char_type *srcinit = src; const char *dstinit = dst; + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); uint32_t x; size_t idx; @@ -10038,27 +10041,52 @@ base64_tail_decode(char *dst, const char_type *src, size_t length, } idx = 0; // we need at least four characters. - while (idx < 4 && src < srcend) { +#ifdef __clang__ + // If possible, we read four characters at a time. (It is an optimization.) + if (ignore_garbage && src + 4 <= srcend) { + char_type c0 = src[0]; + char_type c1 = src[1]; + char_type c2 = src[2]; + char_type c3 = src[3]; + uint8_t code0 = to_base64[uint8_t(c0)]; + uint8_t code1 = to_base64[uint8_t(c1)]; + uint8_t code2 = to_base64[uint8_t(c2)]; + uint8_t code3 = to_base64[uint8_t(c3)]; + buffer[idx] = code0; + idx += (is_eight_byte(c0) && code0 <= 63); + buffer[idx] = code1; + idx += (is_eight_byte(c1) && code1 <= 63); + buffer[idx] = code2; + idx += (is_eight_byte(c2) && code2 <= 63); + buffer[idx] = code3; + idx += (is_eight_byte(c3) && code3 <= 63); + src += 4; + } +#endif + while ((idx < 4) && (src < srcend)) { char_type c = *src; uint8_t code = to_base64[uint8_t(c)]; buffer[idx] = uint8_t(code); if (is_eight_byte(c) && code <= 63) { idx++; - } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { + } else if (!ignore_garbage && + (code > 64 || !scalar::base64::is_eight_byte(c))) { return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } else { - // We have a space or a newline. We ignore it. + // We have a space or a newline or garbage. We ignore it. } src++; } if (idx != 4) { - if (last_chunk_options == last_chunk_handling_options::strict && + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict && (idx != 1) && ((idx + padded_characters) & 3) != 0) { // The partial chunk was at src - idx return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), size_t(dst - dstinit)}; - } else if (last_chunk_options == + } else if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::stop_before_partial && (idx != 1) && ((idx + padded_characters) & 3) != 0) { // Rewind src to before partial chunk @@ -10068,7 +10096,8 @@ base64_tail_decode(char *dst, const char_type *src, size_t length, if (idx == 2) { uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && + if (!ignore_garbage && + (last_chunk_options == last_chunk_handling_options::strict) && (triple & 0xffff)) { return {BASE64_EXTRA_BITS, size_t(src - srcinit), size_t(dst - dstinit)}; @@ -10086,7 +10115,8 @@ base64_tail_decode(char *dst, const char_type *src, size_t length, uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) + (uint32_t(buffer[2]) << 1 * 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && + if (!ignore_garbage && + (last_chunk_options == last_chunk_handling_options::strict) && (triple & 0xff)) { return {BASE64_EXTRA_BITS, size_t(src - srcinit), size_t(dst - dstinit)}; @@ -10100,7 +10130,7 @@ base64_tail_decode(char *dst, const char_type *src, size_t length, std::memcpy(dst, &triple, 2); } dst += 2; - } else if (idx == 1) { + } else if (!ignore_garbage && idx == 1) { return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -10154,6 +10184,9 @@ result base64_tail_decode_safe( const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3; + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); const char_type *srcend = src + length; const char_type *srcinit = src; @@ -10184,6 +10217,28 @@ result base64_tail_decode_safe( idx = 0; const char_type *srccur = src; // We need at least four characters. +#ifdef __clang__ + // If possible, we read four characters at a time. (It is an optimization.) + if (ignore_garbage && src + 4 <= srcend) { + char_type c0 = src[0]; + char_type c1 = src[1]; + char_type c2 = src[2]; + char_type c3 = src[3]; + uint8_t code0 = to_base64[uint8_t(c0)]; + uint8_t code1 = to_base64[uint8_t(c1)]; + uint8_t code2 = to_base64[uint8_t(c2)]; + uint8_t code3 = to_base64[uint8_t(c3)]; + buffer[idx] = code0; + idx += (is_eight_byte(c0) && code0 <= 63); + buffer[idx] = code1; + idx += (is_eight_byte(c1) && code1 <= 63); + buffer[idx] = code2; + idx += (is_eight_byte(c2) && code2 <= 63); + buffer[idx] = code3; + idx += (is_eight_byte(c3) && code3 <= 63); + src += 4; + } +#endif while (idx < 4 && src < srcend) { char_type c = *src; uint8_t code = to_base64[uint8_t(c)]; @@ -10191,22 +10246,25 @@ result base64_tail_decode_safe( buffer[idx] = uint8_t(code); if (is_eight_byte(c) && code <= 63) { idx++; - } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { + } else if (!ignore_garbage && + (code > 64 || !scalar::base64::is_eight_byte(c))) { outlen = size_t(dst - dstinit); srcr = src; return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } else { - // We have a space or a newline. We ignore it. + // We have a space or a newline or garbage. We ignore it. } src++; } if (idx != 4) { - if (last_chunk_options == last_chunk_handling_options::strict && + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict && ((idx + padded_characters) & 3) != 0) { outlen = size_t(dst - dstinit); srcr = src; return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)}; - } else if (last_chunk_options == + } else if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::stop_before_partial && ((idx + padded_characters) & 3) != 0) { // Rewind src to before partial chunk @@ -10219,7 +10277,7 @@ result base64_tail_decode_safe( outlen = size_t(dst - dstinit); srcr = src; return {SUCCESS, size_t(dst - dstinit)}; - } else if (idx == 1) { + } else if (!ignore_garbage && idx == 1) { // Error: Incomplete chunk of length 1 is invalid in loose mode outlen = size_t(dst - dstinit); srcr = src; @@ -10235,7 +10293,8 @@ result base64_tail_decode_safe( uint32_t triple = 0; if (idx == 2) { triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12); - if ((last_chunk_options == last_chunk_handling_options::strict) && + if (!ignore_garbage && + (last_chunk_options == last_chunk_handling_options::strict) && (triple & 0xffff)) { srcr = src; return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; @@ -10247,7 +10306,8 @@ result base64_tail_decode_safe( } else if (idx == 3) { triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && + if (!ignore_garbage && + (last_chunk_options == last_chunk_handling_options::strict) && (triple & 0xff)) { srcr = src; return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; @@ -18540,7 +18600,7 @@ void base64_decode_block(char *out, const char *src) { vst3q_u8((uint8_t *)out, outvec); } -template +template full_result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options, @@ -18571,7 +18631,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -18592,7 +18652,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, bool error = false; uint64_t badcharmask = to_base64_mask(&b, &error); if (badcharmask) { - if (error) { + if (error && !ignore_garbage) { src -= 64; while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { @@ -18636,7 +18696,8 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -18678,8 +18739,14 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, // backtrack int leftover = int(bufferptr - buffer_start); while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } src--; leftover--; @@ -18696,7 +18763,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, r.output_count += size_t(dst - dstinit); } if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -18706,7 +18773,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, } return r; } - if (equalsigns > 0) { + if (equalsigns > 0 && !ignore_garbage) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -22186,21 +22253,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -22211,21 +22302,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -22735,6 +22850,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -22757,7 +22875,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -22765,7 +22883,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -22777,6 +22895,9 @@ simdutf_warn_unused result implementation::base64_to_binary( simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -22799,7 +22920,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -22807,7 +22928,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( full_result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -22825,6 +22946,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -22847,7 +22971,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -22855,7 +22979,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -22867,6 +22991,9 @@ simdutf_warn_unused result implementation::base64_to_binary( simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -22889,7 +23016,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -22897,7 +23024,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( full_result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -26267,7 +26394,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, return (size_t)(out - (uint8_t *)dst) + output_len; } -template +template static inline uint64_t to_base64_mask(block64 *b, uint64_t *error, uint64_t input_mask = UINT64_MAX) { __m512i input = b->chunks[0]; @@ -26309,7 +26436,7 @@ static inline uint64_t to_base64_mask(block64 *b, uint64_t *error, const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); const __m512i combined = _mm512_or_si512(translated, input); const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask; - if (mask) { + if (!ignore_garbage && mask) { const __mmask64 spaces = _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input), input) & @@ -26390,7 +26517,7 @@ static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out, b->chunks[0]); } -template +template full_result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options, @@ -26402,11 +26529,12 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, srclen; // location of the first padding character if any size_t equalsigns = 0; // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + while (!ignore_garbage && srclen > 0 && + scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } - if (srclen > 0 && src[srclen - 1] == '=') { + if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') { equallocation = srclen - 1; srclen--; equalsigns = 1; @@ -26422,7 +26550,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -26442,8 +26570,9 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, load_block(&b, src); src += 64; uint64_t error = 0; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (!ignore_garbage && error) { src -= 64; size_t error_offset = _tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, @@ -26479,8 +26608,9 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, block64 b; load_block_partial(&b, src, input_mask); uint64_t error = 0; - uint64_t badcharmask = to_base64_mask(&b, &error, input_mask); - if (error) { + uint64_t badcharmask = + to_base64_mask(&b, &error, input_mask); + if (!ignore_garbage && error) { size_t error_offset = _tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; @@ -26513,14 +26643,16 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, 5, 6, 0, 1, 2); const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); - if (last_chunk_options == last_chunk_handling_options::strict && + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict && (idx != 1) && ((idx + equalsigns) & 3) != 0) { // The partial chunk was at src - idx _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), size_t(dst - dstinit)}; - } else if (last_chunk_options == + } else if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::stop_before_partial && (idx != 1) && ((idx + equalsigns) & 3) != 0) { // Rewind src to before partial chunk @@ -26529,7 +26661,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, src -= idx; } else { if (idx == 2) { - if (last_chunk_options == last_chunk_handling_options::strict) { + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict) { uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) + (uint32_t(bufferptr[-1]) << 2 * 6); if (triple & 0xffff) { @@ -26544,7 +26677,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; } else if (idx == 3) { - if (last_chunk_options == last_chunk_handling_options::strict) { + if (!ignore_garbage && + last_chunk_options == last_chunk_handling_options::strict) { uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) + (uint32_t(bufferptr[-2]) << 2 * 6) + (uint32_t(bufferptr[-1]) << 1 * 6); @@ -26559,7 +26693,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, output_len += 2; _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; - } else if (idx == 1) { + } else if (!ignore_garbage && idx == 1) { _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); dst += output_len; return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), @@ -26570,7 +26704,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } } - if (last_chunk_options != stop_before_partial && equalsigns > 0) { + if (!ignore_garbage && last_chunk_options != stop_before_partial && + equalsigns > 0) { size_t output_count = size_t(dst - dstinit); if ((output_count % 3 == 0) || ((output_count % 3) + 1 + equalsigns != 4)) { @@ -26581,7 +26716,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, return {SUCCESS, srclen, size_t(dst - dstinit)}; } - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -28141,21 +28276,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -28166,21 +28325,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -31062,7 +31245,7 @@ struct block64 { __m256i chunks[2]; }; -template +template static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { const __m256i ascii_space_tbl = _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, @@ -31144,7 +31327,7 @@ static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { const __m256i chk = _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); const int mask = _mm256_movemask_epi8(chk); - if (mask) { + if (!ignore_garbage && mask) { __m256i ascii_space = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); *error = (mask ^ _mm256_movemask_epi8(ascii_space)); @@ -31153,13 +31336,17 @@ static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { return (uint32_t)mask; } -template +template static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { uint32_t err0 = 0; uint32_t err1 = 0; - uint64_t m0 = to_base64_mask(&b->chunks[0], &err0); - uint64_t m1 = to_base64_mask(&b->chunks[1], &err1); - *error = err0 | ((uint64_t)err1 << 32); + uint64_t m0 = + to_base64_mask(&b->chunks[0], &err0); + uint64_t m1 = + to_base64_mask(&b->chunks[1], &err1); + if (!ignore_garbage) { + *error = err0 | ((uint64_t)err1 << 32); + } return m0 | (m1 << 32); } @@ -31238,7 +31425,7 @@ static inline void base64_decode_block_safe(char *out, block64 *b) { std::memcpy(out + 24, buffer, 24); } -template +template full_result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options, @@ -31248,12 +31435,13 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, size_t equallocation = srclen; // location of the first padding character if any // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + while (!ignore_garbage && srclen > 0 && + scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { + if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') { equallocation = srclen - 1; srclen--; equalsigns = 1; @@ -31269,7 +31457,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -31292,8 +31480,9 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, load_block(&b, src); src += 64; uint64_t error = 0; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (!ignore_garbage && error) { src -= 64; size_t error_offset = _tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, @@ -31342,7 +31531,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { + if (!ignore_garbage && + (!scalar::base64::is_eight_byte(*src) || val > 64)) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -31388,8 +31578,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, // backtrack int leftover = int(bufferptr - buffer_start); while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } src--; leftover--; @@ -31405,7 +31601,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } else { r.output_count += size_t(dst - dstinit); } - if (last_chunk_options != stop_before_partial && + if (!ignore_garbage && last_chunk_options != stop_before_partial && r.error == error_code::SUCCESS && equalsigns > 0) { // additional checks if ((r.output_count % 3 == 0) || @@ -31416,7 +31612,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } return r; } - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -34195,21 +34391,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -34220,21 +34440,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -36030,6 +36274,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); // skip trailing spaces while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { @@ -36053,7 +36300,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -36061,7 +36308,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -36078,6 +36325,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); // skip trailing spaces while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { @@ -36101,7 +36351,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -36109,7 +36359,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -37828,6 +38078,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -37850,7 +38103,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -37858,7 +38111,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -37870,6 +38123,9 @@ simdutf_warn_unused result implementation::base64_to_binary( simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -37892,7 +38148,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -37900,7 +38156,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( full_result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -37918,6 +38174,9 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -37940,7 +38199,7 @@ simdutf_warn_unused result implementation::base64_to_binary( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation}; } return {SUCCESS, 0}; @@ -37948,7 +38207,7 @@ simdutf_warn_unused result implementation::base64_to_binary( result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation}; @@ -37960,6 +38219,9 @@ simdutf_warn_unused result implementation::base64_to_binary( simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { + const bool ignore_garbage = + (options == base64_options::base64_url_accept_garbage) || + (options == base64_options::base64_default_accept_garbage); while (length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; @@ -37982,7 +38244,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( } } if (length == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -37990,7 +38252,7 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( full_result r = scalar::base64::base64_tail_decode( output, input, length, equalsigns, options, last_chunk_options); if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -40813,7 +41075,7 @@ struct block64 { __m128i chunks[4]; }; -template +template static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { const __m128i ascii_space_tbl = _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, @@ -40878,7 +41140,7 @@ static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { const __m128i chk = _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); const int mask = _mm_movemask_epi8(chk); - if (mask) { + if (!ignore_garbage && mask) { __m128i ascii_space = _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); *error = (mask ^ _mm_movemask_epi8(ascii_space)); @@ -40887,18 +41149,24 @@ static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { return (uint16_t)mask; } -template +template static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { uint32_t err0 = 0; uint32_t err1 = 0; uint32_t err2 = 0; uint32_t err3 = 0; - uint64_t m0 = to_base64_mask(&b->chunks[0], &err0); - uint64_t m1 = to_base64_mask(&b->chunks[1], &err1); - uint64_t m2 = to_base64_mask(&b->chunks[2], &err2); - uint64_t m3 = to_base64_mask(&b->chunks[3], &err3); - *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | - ((uint64_t)err3 << 48); + uint64_t m0 = + to_base64_mask(&b->chunks[0], &err0); + uint64_t m1 = + to_base64_mask(&b->chunks[1], &err1); + uint64_t m2 = + to_base64_mask(&b->chunks[2], &err2); + uint64_t m3 = + to_base64_mask(&b->chunks[3], &err3); + if (!ignore_garbage) { + *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | + ((uint64_t)err3 << 48); + } return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); } @@ -41011,7 +41279,7 @@ static inline void base64_decode_block_safe(char *out, block64 *b) { std::memcpy(out + 36, buffer, 12); } -template +template full_result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options, @@ -41021,12 +41289,13 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, size_t equallocation = srclen; // location of the first padding character if any // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + while (!ignore_garbage && srclen > 0 && + scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { + if (!ignore_garbage && srclen > 0 && src[srclen - 1] == '=') { equallocation = srclen - 1; srclen--; equalsigns = 1; @@ -41042,7 +41311,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -41065,8 +41334,9 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, load_block(&b, src); src += 64; uint64_t error = 0; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { + uint64_t badcharmask = + to_base64_mask(&b, &error); + if (error && !ignore_garbage) { src -= 64; size_t error_offset = simdutf_tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, @@ -41114,7 +41384,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -41160,8 +41431,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, // backtrack int leftover = int(bufferptr - buffer_start); while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } src--; leftover--; @@ -41178,7 +41455,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, r.output_count += size_t(dst - dstinit); } if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -41188,7 +41465,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } return r; } - if (equalsigns > 0) { + if (equalsigns > 0 && !ignore_garbage) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -43976,21 +44253,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -44001,21 +44302,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -46960,7 +47285,7 @@ static inline void base64_decode_block_safe(char *out, block64 *b) { base64_decode_block(out, b); } -template +template full_result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options, @@ -46991,7 +47316,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -47012,7 +47337,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, bool error = false; uint64_t badcharmask = to_base64_mask(&b, &error); if (badcharmask) { - if (error) { + if (error && !ignore_garbage) { src -= 64; while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { @@ -47056,7 +47381,8 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -47098,8 +47424,14 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, // backtrack int leftover = int(bufferptr - buffer_start); while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } src--; leftover--; @@ -47116,7 +47448,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, r.output_count += size_t(dst - dstinit); } if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -47126,7 +47458,7 @@ compress_decode_base64(char *dst, const char_type *src, size_t srclen, } return r; } - if (equalsigns > 0) { + if (equalsigns > 0 && !ignore_garbage) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -49864,21 +50196,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -49889,21 +50245,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -53234,7 +53614,7 @@ static inline void base64_decode_block_safe(char *out, block64 *b) { std::memcpy(out + 24, buffer, 24); } -template +template full_result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options, @@ -53265,7 +53645,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } } if (srclen == 0) { - if (equalsigns > 0) { + if (!ignore_garbage && equalsigns > 0) { return {INVALID_BASE64_CHARACTER, equallocation, 0}; } return {SUCCESS, 0, 0}; @@ -53289,7 +53669,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, src += 64; bool error = false; uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { + if (error && !ignore_garbage) { src -= 64; while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { @@ -53341,7 +53721,8 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { + if ((!scalar::base64::is_eight_byte(*src) || val > 64) && + !ignore_garbage) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), size_t(dst - dstinit)}; } @@ -53387,8 +53768,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, // backtrack int leftover = int(bufferptr - buffer_start); while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; + if (!ignore_garbage) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + } else { + while (to_base64[uint8_t(*(src - 1))] >= 64) { + src--; + } } src--; leftover--; @@ -53405,7 +53792,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, r.output_count += size_t(dst - dstinit); } if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { + r.error == error_code::SUCCESS && equalsigns > 0 && !ignore_garbage) { // additional checks if ((r.output_count % 3 == 0) || ((r.output_count % 3) + 1 + equalsigns != 4)) { @@ -53415,7 +53802,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } return r; } - if (equalsigns > 0) { + if (equalsigns > 0 && !ignore_garbage) { if ((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; @@ -56273,21 +56660,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -56298,21 +56709,45 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + if (options & base64_url) { + if (options == base64_options::base64_url_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } + } else { + if (options == base64_options::base64_default_accept_garbage) { + return compress_decode_base64(output, input, length, options, + last_chunk_options); + } else { + return compress_decode_base64(output, input, length, + options, last_chunk_options); + } + } } simdutf_warn_unused size_t implementation::base64_length_from_binary( diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index 2d984f40e7bc3f..383687215b010c 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! */ +/* auto-generated on 2024-12-17 14:54:59 -0500. Do not edit! */ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -675,7 +675,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "5.6.4" +#define SIMDUTF_VERSION "5.7.0" namespace simdutf { enum { @@ -686,11 +686,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdutf being used. */ - SIMDUTF_VERSION_MINOR = 6, + SIMDUTF_VERSION_MINOR = 7, /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 4 + SIMDUTF_VERSION_REVISION = 0 }; } // namespace simdutf @@ -2702,6 +2702,9 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input, size_t length); // base64_options are used to specify the base64 encoding options. +// ASCII spaces are ' ', '\t', '\n', '\r', '\f' +// garbage characters are characters that are not part of the base64 alphabet +// nor ASCII spaces. enum base64_options : uint64_t { base64_default = 0, /* standard base64 format (with padding) */ base64_url = 1, /* base64url format (no padding) */ @@ -2711,6 +2714,10 @@ enum base64_options : uint64_t { base64_reverse_padding, /* standard base64 format without padding */ base64_url_with_padding = base64_url | base64_reverse_padding, /* base64url with padding */ + base64_default_accept_garbage = + 4, /* standard base64 format accepting garbage characters */ + base64_url_accept_garbage = + 5, /* base64url format accepting garbage characters */ }; // last_chunk_handling_options are used to specify the handling of the last